This tutorial covers the basic population genetics analyses
for a set of populations (or closely related species)
1) Evaluating, filtering and plotting genetic data
2) Test for Hardy-Weinberg-Equilibrium (HWE) and linkage between
loci
3) Calculating, assessing and plotting genotypic richness and diversity,
inbreeding and heterozygosity
4) Calculating and plotting genetic differentiation (Fst, Gst and Josts
D)
5) Calculating and plotting isolation-by-distance (IBD)
6) Performing and plotting principal component analyses (PCA)
7) Performing and plotting discriminant analyses of principal components
(DAPC) with cross-validation
8) Creating Structure-like plots and maps based on DAPC results
As input is required (see Import data paragraph
below)
1) a vcf file with the genetic SNP data
2) a dataset containing individual IDs, longitude, latitude and
population/species assignment for each sample/individual (it is
recommended to name individuals in SNP data exactly like IDs in dataset
to facilitate any filtering/subsetting data)
The tutorial uses an example dataset of unpublished ddRAD data (121 individiduals of 11 taxa) from the Hemileuca maia buck moth species group (Saturniidae: Lepidoptera) distributed throughout the US and southern Canada
## Ensure required installers are present
if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager")
## Install and load all packages
install_if_missing <- function(pkg) {
if (!require(pkg, character.only = T)) {
install.packages(pkg, dependencies = T)
library(pkg, character.only = T)}}
install_if_missing("adegenet")
install_if_missing("conStruct")
install_if_missing("dplyr")
install_if_missing("geosphere")
install_if_missing("ggplot2")
install_if_missing("hierfstat")
install_if_missing("mmod")
install_if_missing("pegas")
install_if_missing("plotly")
install_if_missing("poppr")
install_if_missing("reshape2")
install_if_missing("StAMPP")
install_if_missing("stringr")
install_if_missing("vcfR")
install_if_missing("viridis")
## Install SNPRelate from Bioconductor
if (!requireNamespace("SNPRelate", quietly = TRUE)) {BiocManager::install("SNPRelate")}
library(SNPRelate)
Ignore any warnings that may come up (not shown in output here)
rm(list = ls()) #clear environment
setwd("C:/Users/danie/Desktop/PhD research/Tutorials/Population_genetics_tutorial")
#set working directory (containing vcf and datafile)
dataset <- read.table(file = "Hmg_data.txt", sep = "\t", header = T, stringsAsFactors = T) #import dataset containing columns: ID, Species, longitude, Latitude
genpop_data <- vcfR2genind(read.vcfR("Hmg_pop.vcf")) #import vcf file and convert to genind object
outgroup <- c("CA_OroGrandeWash_Hmg086", "CA_OroGrandeWash_Hmg087")
genpop_data <- genpop_data[!indNames(genpop_data) %in% outgroup, ]
dataset <- dataset[!dataset$ID%in% outgroup, ]
population_assignment <- dataset$Species #set population assignment
population_assignment <- factor(population_assignment, levels = unique(population_assignment))
genpop_data$pop <- droplevels(as.factor(population_assignment))
droplevels(as.factor(population_assignment))
head(dataset) #check dataset
## ID Species Latitude Longitude
## 1 NH_CheshireCo_Hmg129 H.lucina 42.92000 -71.22000
## 2 NH_CheshireCo_Hmg130 H.lucina 42.92000 -71.22000
## 3 NH_CheshireCo_Hmg308 H.lucina 42.92000 -71.22000
## 4 MA_Montague_Hmg314 H.maia.x.lucina 42.55740 -72.51770
## 5 IA_LoessHills_Hmg315 H.latifascia 41.92489 -95.99636
## 6 AL_DunavantValley_Hmg344 H.maia 33.44000 -86.61000
table(genpop_data$pop) #show taxa and number of samples per taxon
##
## H.lucina H.maia.x.lucina H.latifascia
## 9 1 11
## H.maia H.sp.GreatLakesComplex H.nevadensis
## 47 10 16
## H.slosseri H.peigleri
## 17 10
nLoc(genpop_data) #show number of loci
## [1] 5173
unique(genpop_data@pop) #show all species (or populations )
## [1] H.lucina H.maia.x.lucina H.latifascia
## [4] H.maia H.sp.GreatLakesComplex H.nevadensis
## [7] H.slosseri H.peigleri
## 8 Levels: H.lucina H.maia.x.lucina H.latifascia ... H.peigleri
poppr(genpop_data) #show basic summary table, including number of individuals (N) and genotypes (MLG) per population, Shannon-Wiener index (H; Shannon 2001, genotype diversity), Stoddart and Taylor's (1988) G index (G, genotype diversity) and Nei’s unbiased gene diversity (Nei 1978; Hexp)
## Pop N MLG eMLG SE H G lambda E.5 Hexp Ia
## 1 H.lucina 9 9 9 0.00e+00 2.20 9 0.889 1 0 60.4
## 2 H.maia.x.lucina 1 1 1 0.00e+00 0.00 1 0.000 NaN 0 NA
## 3 H.latifascia 11 11 10 0.00e+00 2.40 11 0.909 1 0 16.6
## 4 H.maia 47 47 10 0.00e+00 3.85 47 0.979 1 0 26.7
## 5 H.sp.GreatLakesComplex 10 10 10 0.00e+00 2.30 10 0.900 1 0 36.8
## 6 H.nevadensis 16 16 10 0.00e+00 2.77 16 0.938 1 0 36.9
## 7 H.slosseri 17 17 10 2.31e-07 2.83 17 0.941 1 0 44.0
## 8 H.peigleri 10 10 10 0.00e+00 2.30 10 0.900 1 0 29.2
## 9 Total 121 121 10 0.00e+00 4.80 121 0.992 1 0 28.8
## rbarD File
## 1 0.03413 genpop_data
## 2 NA genpop_data
## 3 0.00692 genpop_data
## 4 0.00758 genpop_data
## 5 0.01671 genpop_data
## 6 0.01486 genpop_data
## 7 0.01669 genpop_data
## 8 0.01454 genpop_data
## 9 0.00736 genpop_data
sort(private_alleles(genpop_data) %>% apply(MARGIN = 1, FUN = sum), decreasing = T) #show number of private alleles (alleles found exclusively in one population) per site across all loci for each population
## H.maia H.lucina H.sp.GreatLakesComplex
## 580 109 64
## H.nevadensis H.slosseri H.latifascia
## 63 52 32
## H.peigleri H.maia.x.lucina
## 2 0
missing_data <- poppr::info_table(genpop_data, type = "missing", plot = T) #plot missing data across loci and populations
round(missing_data[1:length(unique(genpop_data$pop)), 1:4], 2) #show proportion of missing data across populations for first four loci
## Locus
## Population 1337:7:+ 1344:7:- 1538:93:+ 1604:40:-
## H.lucina 0.11 0.56 0.33 0.00
## H.maia.x.lucina 1.00 1.00 1.00 0.00
## H.latifascia 0.00 0.00 0.27 0.09
## H.maia 0.06 0.09 0.13 0.04
## H.sp.GreatLakesComplex 0.10 0.10 0.30 0.00
## H.nevadensis 0.00 0.00 0.06 0.19
## H.slosseri 0.12 0.06 0.12 0.06
## H.peigleri 0.10 0.00 0.10 0.00
genpop_data <- poppr::informloci(genpop_data,
MAF = 0.01, #only retain loci with at least 1% of the minor allele (minor allele frequency cutoff to 5%)
cutoff = 2 / nInd(genpop_data)) #retain only loci with at least (2 / total number of individuals) differentiating genotypes are retained (a locus must have at least (2 / total number of individuals) of its genotypes be different from each other to be considered useful and being retained to ensure that only loci with enough variation are kept)
## cutoff value: 1.65289256198347 % ( 2 samples ).
## MAF : 0.01
##
## Found 544 uninformative loci
## ============================
## 60 loci found with a cutoff of 2 samples :
## 234597:20:+, 249309:9:+, 267729:15:+, 330753:6:-, 390364:6:+,
## 395275:11:-, 430560:27:+, 479991:37:-, 503404:39:+, 551262:11:-,
## 609753:42:+, 610035:10:+, 759254:12:-, 776354:11:+, 777079:11:-,
## 820465:37:+, 898922:27:-, 977991:13:-, 1055307:9:-, 1056600:19:+,
## 1080379:59:-, 1097298:4:-, 1098638:7:+, 1199902:12:-, 1210181:49:-,
## 1357125:30:-, 1382467:21:+, 1495896:20:-, 1532928:6:-, 1596396:10:+,
## 1643540:5:-, 1705783:44:+, 1717386:21:+, 1759208:13:-, 1761264:4:+,
## 1762811:10:+, 1828445:6:-, 1829645:27:+, 1948175:7:+, 1982183:6:-,
## 2013013:6:-, 2218908:8:-, 2307601:6:+, 2376350:51:+, 2403159:24:-,
## 2409847:58:-, 2412337:25:-, 2527720:22:-, 2616050:12:-, 2721002:10:+,
## 2759681:9:-, 2911224:34:+, 2977139:18:+, 2993628:53:+, 2995958:4:+,
## 3215762:8:-, 3279858:6:-, 3310354:18:+, 3335345:24:+, 3365183:30:-
## 544 loci found with MAF < 0.01 :
## 1337:7:+, 12488:9:+, 22620:9:+, 24788:6:-, 38194:4:-, 42958:8:+,
## 48723:6:-, 65637:19:-, 72032:14:-, 78058:4:+, 84123:35:+, 96838:54:+,
## 121454:31:+, 125541:14:+, 128764:4:-, 134084:4:+, 139964:4:-,
## 140704:26:-, 154460:8:-, 156841:9:-, 172343:14:+, 172469:9:+,
## 175073:9:-, 178277:9:-, 178522:42:-, 184552:18:-, 188755:40:-,
## 196503:21:-, 197938:14:-, 206265:15:-, 231381:16:-, 234597:20:+,
## 235868:8:-, 235915:19:-, 239754:16:-, 249309:9:+, 250369:40:-,
## 255968:12:+, 257692:7:+, 258856:58:-, 259039:15:-, 263627:14:-,
## 265997:4:-, 267729:15:+, 270708:18:+, 272021:12:+, 275577:7:+,
## 278694:48:-, 285495:8:+, 288977:12:-, 297850:9:-, 300051:13:+,
## 307195:13:-, 313001:22:-, 315842:7:-, 330753:6:-, 337482:12:-,
## 341860:15:+, 345195:35:-, 345771:22:+, 356223:7:+, 375697:20:-,
## 390255:6:+, 390364:6:+, 390926:44:+, 395147:48:+, 395275:11:-,
## 405205:44:-, 410405:6:-, 414448:8:-, 419988:25:-, 428684:12:+,
## 430138:15:+, 430560:27:+, 434006:5:+, 436347:21:-, 446766:9:-,
## 448971:4:+, 450854:7:+, 457849:15:-, 461761:11:-, 473135:48:+,
## 473583:8:-, 474080:7:+, 474268:7:-, 479991:37:-, 482412:21:-,
## 491595:21:+, 499882:27:-, 503404:39:+, 506303:93:-, 515279:6:+,
## 535313:5:+, 536630:59:+, 537190:6:+, 541578:30:-, 543838:18:+,
## 548319:16:-, 549402:12:-, 550483:10:-, 550991:10:-, 551262:11:-,
## 554075:11:-, 559828:5:-, 563652:19:+, 566546:8:-, 580261:7:-,
## 604546:9:-, 609753:42:+, 610035:10:+, 614784:8:+, 620834:41:+,
## 625699:49:+, 628118:6:+, 643765:7:-, 649863:35:-, 650712:11:+,
## 656762:10:-, 659001:6:+, 662018:44:-, 669966:15:+, 672261:15:+,
## 679020:33:-, 681542:21:-, 682917:24:+, 687532:4:-, 692659:16:-,
## 696336:10:-, 697912:15:+, 703241:21:-, 707153:40:+, 710910:5:+,
## 718791:5:+, 723997:11:-, 726565:5:+, 727851:36:-, 729657:8:+,
## 732548:39:-, 736870:17:+, 747740:10:-, 750929:17:-, 756107:12:-,
## 759254:12:-, 760998:11:+, 766840:4:+, 772302:13:-, 773928:7:-,
## 774117:73:+, 775659:9:+, 776354:11:+, 777079:11:-, 777443:12:+,
## 778270:36:-, 789617:48:+, 790128:41:+, 811746:5:-, 819100:11:+,
## 819327:17:-, 820465:37:+, 825100:21:-, 831670:6:-, 841628:17:+,
## 867057:6:+, 867132:5:+, 871522:4:+, 880671:43:+, 882527:5:-,
## 898922:27:-, 917205:13:-, 929975:9:-, 959746:36:-, 966142:12:+,
## 977991:13:-, 979187:20:+, 980706:5:-, 994801:9:-, 1001873:30:+,
## 1030276:10:+, 1050635:16:+, 1055307:9:-, 1056117:7:-, 1056467:35:+,
## 1056600:19:+, 1058512:6:-, 1061001:20:-, 1063374:7:+, 1068068:23:+,
## 1068574:24:-, 1075834:12:-, 1080379:59:-, 1084472:9:+, 1086987:13:+,
## 1097298:4:-, 1098638:7:+, 1107901:7:+, 1111519:21:+, 1119478:5:+,
## 1122674:13:-, 1122730:9:-, 1124908:26:-, 1129053:12:+, 1130922:4:+,
## 1131296:66:+, 1136980:93:+, 1142371:4:-, 1172723:6:-, 1175445:10:-,
## 1178051:16:+, 1188064:28:+, 1192965:10:+, 1195684:9:+, 1199902:12:-,
## 1208771:31:+, 1209362:13:+, 1210181:49:-, 1211497:5:+, 1212711:8:-,
## 1228787:13:-, 1250628:26:-, 1253149:20:-, 1253833:15:-, 1272192:9:-,
## 1283874:6:-, 1285858:14:-, 1286143:72:+, 1300043:7:-, 1301567:18:-,
## 1303636:11:-, 1306128:46:-, 1314963:53:-, 1317589:9:+, 1323595:4:-,
## 1325792:42:+, 1338850:14:+, 1339097:4:+, 1347312:10:-, 1347412:84:+,
## 1348720:4:+, 1351091:8:-, 1357125:30:-, 1360182:15:+, 1382467:21:+,
## 1411773:36:+, 1419937:8:-, 1421558:4:-, 1437981:15:-, 1446627:10:-,
## 1449784:9:-, 1466829:12:-, 1476625:9:-, 1485738:44:-, 1495896:20:-,
## 1508415:52:-, 1509075:12:+, 1532928:6:-, 1552355:9:+, 1556155:5:-,
## 1562109:7:-, 1571857:13:-, 1571956:15:-, 1577844:22:-, 1581607:16:-,
## 1593374:47:+, 1596396:10:+, 1604169:51:-, 1615444:6:+, 1632203:4:+,
## 1643540:5:-, 1651269:6:+, 1655211:18:-, 1658500:7:+, 1660299:9:+,
## 1672751:13:-, 1682905:14:+, 1684017:4:-, 1699564:11:-, 1703312:4:+,
## 1705783:44:+, 1717386:21:+, 1718964:40:+, 1721110:14:-, 1726628:5:+,
## 1727756:47:-, 1732000:6:-, 1736572:9:+, 1744321:27:-, 1753241:12:+,
## 1755573:4:+, 1759208:13:-, 1760087:15:-, 1760881:26:+, 1761264:4:+,
## 1762486:23:+, 1762811:10:+, 1764247:40:+, 1776337:4:+, 1781801:12:-,
## 1787267:16:+, 1795737:13:+, 1819292:7:-, 1819932:12:-, 1823783:20:+,
## 1825241:27:+, 1827040:7:-, 1828445:6:-, 1829645:27:+, 1831438:20:+,
## 1832020:36:-, 1842264:5:-, 1843830:20:-, 1844907:9:-, 1857098:14:-,
## 1863976:5:+, 1868751:5:-, 1872817:24:-, 1887103:33:+, 1887223:36:-,
## 1900993:19:-, 1915300:6:-, 1927473:16:+, 1934290:7:-, 1947652:22:+,
## 1947928:12:+, 1947957:22:-, 1948175:7:+, 1948942:19:+, 1951103:10:-,
## 1953940:17:-, 1971803:21:+, 1982183:6:-, 1988081:16:+, 1988241:7:-,
## 2004350:4:+, 2013013:6:-, 2014900:9:-, 2018145:41:+, 2025219:13:-,
## 2028256:15:+, 2057063:27:-, 2061137:26:-, 2062365:5:-, 2068328:8:+,
## 2069780:6:+, 2072688:10:+, 2077659:16:-, 2079198:23:-, 2080321:9:-,
## 2092814:9:-, 2110545:18:+, 2130427:20:-, 2132717:11:+, 2136215:8:-,
## 2138234:4:-, 2144223:20:+, 2157187:6:+, 2160112:18:-, 2169081:24:+,
## 2171959:15:-, 2190514:15:+, 2191387:5:-, 2193033:87:-, 2196461:7:-,
## 2200550:10:+, 2204766:5:+, 2218908:8:-, 2240061:11:+, 2268956:4:+,
## 2271152:48:+, 2277116:4:+, 2277199:15:-, 2277339:14:-, 2284766:49:-,
## 2285075:16:+, 2292014:33:+, 2301146:9:-, 2303391:12:-, 2307601:6:+,
## 2317597:14:+, 2339974:16:+, 2373256:8:+, 2375452:9:+, 2376350:51:+,
## 2377620:27:+, 2380500:17:+, 2389904:20:+, 2398477:23:-, 2403159:24:-,
## 2403355:27:-, 2406620:19:-, 2409847:58:-, 2410868:78:+, 2412337:25:-,
## 2415230:6:-, 2417676:7:+, 2419051:4:-, 2432487:31:-, 2433460:9:-,
## 2437701:6:-, 2439649:12:+, 2443858:5:+, 2465652:10:+, 2480277:21:+,
## 2487006:4:-, 2490212:26:-, 2496499:19:-, 2502726:24:-, 2503228:5:+,
## 2527700:6:-, 2527720:22:-, 2533874:36:-, 2547003:28:-, 2548127:23:-,
## 2559298:24:-, 2569535:11:+, 2581436:8:+, 2610951:6:-, 2614341:4:-,
## 2616050:12:-, 2617481:4:-, 2626826:6:+, 2627276:64:+, 2628664:36:-,
## 2630869:21:-, 2635134:37:-, 2638443:12:-, 2658914:6:+, 2662921:7:+,
## 2663413:25:-, 2680567:33:-, 2696685:16:+, 2700324:16:-, 2715898:20:+,
## 2719634:17:-, 2721002:10:+, 2726124:6:-, 2727000:9:-, 2735386:5:-,
## 2737033:6:+, 2741529:16:+, 2745477:47:-, 2758368:8:-, 2758487:28:-,
## 2759405:34:+, 2759681:9:-, 2766828:49:+, 2766963:59:-, 2768701:6:+,
## 2771383:15:-, 2786644:67:+, 2798593:10:-, 2806142:18:-, 2807313:6:-,
## 2817882:12:-, 2819054:4:+, 2819264:11:+, 2828486:4:+, 2839560:11:+,
## 2839703:76:-, 2841956:19:-, 2857625:34:-, 2857766:12:-, 2857863:7:-,
## 2863789:4:+, 2869983:9:+, 2871028:21:-, 2877535:9:+, 2878330:9:-,
## 2878501:12:+, 2890373:43:-, 2892783:12:+, 2903089:34:-, 2911224:34:+,
## 2911405:7:-, 2935032:8:-, 2935319:19:+, 2948983:13:-, 2967004:42:+,
## 2977139:18:+, 2978975:6:-, 2985944:11:-, 2986306:25:+, 2992673:17:-,
## 2993271:10:-, 2993353:16:+, 2993628:53:+, 2994999:9:+, 2995958:4:+,
## 2997169:5:+, 3018948:37:-, 3044925:41:-, 3045769:22:+, 3056024:62:-,
## 3091935:17:+, 3092204:14:+, 3112923:9:-, 3133732:23:-, 3144698:4:+,
## 3146587:5:+, 3147248:6:-, 3147551:4:+, 3148770:15:+, 3150676:30:-,
## 3153472:10:+, 3161794:29:-, 3171268:18:+, 3190905:30:-, 3194174:65:-,
## 3198359:19:+, 3199407:11:+, 3203478:4:-, 3214530:10:-, 3215594:9:+,
## 3215732:11:+, 3215762:8:-, 3222181:6:-, 3243256:37:-, 3253054:4:+,
## 3254816:28:-, 3269046:9:-, 3273472:13:+, 3277244:22:+, 3279858:6:-,
## 3299566:6:-, 3310354:18:+, 3311474:11:-, 3317853:30:-, 3323224:4:+,
## 3332556:4:-, 3332930:33:-, 3333910:39:+, 3335345:24:+, 3336781:9:+,
## 3340376:11:+, 3355302:5:+, 3361086:6:+, 3365183:30:-, 3370882:9:-,
## 3371435:6:+, 3376350:4:+, 3387137:20:-, 3401822:17:+, 3417690:6:+,
## 3422906:10:+, 3422983:8:+
genpop_data <- genpop_data %>% missingno("loci", cutoff = 0.2) #remove loci with average missing data higher than 20%
##
## No loci with missing values above 20% found.
genpop_data <- genpop_data %>% missingno("geno", cutoff = 0.3) #remove individuals with more than 30% missing genotypes
##
## Found 118494 missing values.
##
## 2 genotypes contained missing values greater than 30%
##
## Removing 2 genotypes: maia_MA_DN2546, maia_MA_DN2549
dataset <- dataset[match(indNames(genpop_data), dataset$ID), ] #match genetic data and dataset
dataset$Species <- droplevels(dataset$Species) #drop unused factors
genpop_data@pop <- factor(dataset$Species) #reassign population slot of genind object to match updated dataset
poppr(genpop_data, quiet = T) #show basic summary table, including number of individuals (N) and genotypes (MLG) per population, Shannon-Wiener index (H; genotype diversity), Stoddart and Taylor's G index (G, genotype diversity) and expected heterozygosity (Hexp) per population
## Pop N MLG eMLG SE H G lambda E.5 Hexp Ia
## 1 H.latifascia 11 11 10 0.00e+00 2.40 11 0.909 1 0 24.5
## 2 H.lucina 9 9 9 0.00e+00 2.20 9 0.889 1 0 34.5
## 3 H.maia 45 45 10 0.00e+00 3.81 45 0.978 1 0 29.4
## 4 H.maia.x.lucina 1 1 1 0.00e+00 0.00 1 0.000 NaN 0 NA
## 5 H.nevadensis 16 16 10 0.00e+00 2.77 16 0.938 1 0 26.3
## 6 H.peigleri 10 10 10 0.00e+00 2.30 10 0.900 1 0 18.1
## 7 H.slosseri 17 17 10 2.31e-07 2.83 17 0.941 1 0 57.1
## 8 H.sp.GreatLakesComplex 10 10 10 0.00e+00 2.30 10 0.900 1 0 24.9
## 9 Total 119 119 10 0.00e+00 4.78 119 0.992 1 0 27.7
## rbarD File
## 1 0.01245 genpop_data
## 2 0.02325 genpop_data
## 3 0.00960 genpop_data
## 4 NA genpop_data
## 5 0.05717 genpop_data
## 6 0.00910 genpop_data
## 7 0.02560 genpop_data
## 8 0.01204 genpop_data
## 9 0.00711 genpop_data
missing_data <- poppr::info_table(genpop_data, type = "missing", plot = T) #plot missing data across loci and populations
round(missing_data[1:length(unique(genpop_data$pop)), 1:4], 2) #show proportion of missing data across populations for first four loci
## Locus
## Population 1344:7:- 1538:93:+ 1604:40:- 1802:5:+
## H.latifascia 0.55 0.27 0.00 0.18
## H.lucina 0.00 0.11 0.33 0.00
## H.maia 0.09 0.13 0.04 0.09
## H.maia.x.lucina 0.00 1.00 0.00 1.00
## H.nevadensis 0.06 0.06 0.00 0.06
## H.peigleri 0.00 0.10 0.00 0.00
## H.slosseri 0.00 0.24 0.06 0.12
## H.sp.GreatLakesComplex 0.00 0.30 0.00 0.10
nLoc(genpop_data) #show number of loci
## [1] 4629
private_alleles(genpop_data) %>% apply(MARGIN = 1, FUN = sum) #show number of private alleles (alleles found exclusively in one population) per site across all loci for each population
## H.latifascia H.lucina H.maia
## 252 124 2327
## H.maia.x.lucina H.nevadensis H.peigleri
## 2 941 93
## H.slosseri H.sp.GreatLakesComplex
## 609 189
plot(summary(genpop_data)$n.by.pop, summary(genpop_data)$pop.n.all,
xlab = "Number of indiduals", ylab = "Number of alleles",
main = "Alleles numbers and sample sizes", type = "n")
text(summary(genpop_data)$n.by.pop, summary(genpop_data)$pop.n.all,
lab = names(summary(genpop_data)$n.by.pop))
barplot(summary(genpop_data)$Hexp - summary(genpop_data)$Hobs,
main = "Heterozygosity: expected-observed", ylab = "Hexp - Hobs")
par(mar = c(13.1, 5.1, 4.1, 2.1)) #adjust plot margins
barplot(summary(genpop_data)$n.by.pop,
main = "Sample sizes per population", ylab = "Number of genotypes", las = 3)
tail(sort(round(summary(genpop_data)$Hobs, 2))) #show loci with highest observed heterozygosity
## 3219076:10:- 1714120:9:+ 769190:9:+ 1079236:33:- 3003808:16:+ 179790:9:-
## 0.44 0.45 0.46 0.46 0.46 0.47
Diversity indices incorporate both genotypic richness and abundance.
round(((poppr(genpop_data))$eMLG), 2) #show genotypic richness accounting for sample size differences (eMLG) via rarefaction
## [1] 10 9 10 1 10 10 10 10 10
round(((poppr(genpop_data))$lambda), 2) #show Simpson's index lambda (Simpson 1949; measure of genotypic diversity: estimation of probability that two randomly selected genotypes are different scaling from 0 with no genotypes are different to 1 so that all genotypes are different)
## [1] 0.91 0.89 0.98 0.00 0.94 0.90 0.94 0.90 0.99
Corr_Simp_ind <- round((((poppr(genpop_data, quiet = T))$N / ((poppr(genpop_data, quiet = T))$N - 1)) * (poppr(genpop_data, quiet = T))$lambda), 2) #calculate sample-size-corrected Simpson's index
data.frame(Population = levels(genpop_data$pop), Value = Corr_Simp_ind[1:length(levels(genpop_data$pop))], stringsAsFactors = F) #print corrected Simpson's index
## Population Value
## 1 H.latifascia 1
## 2 H.lucina 1
## 3 H.maia 1
## 4 H.maia.x.lucina NaN
## 5 H.nevadensis 1
## 6 H.peigleri 1
## 7 H.slosseri 1
## 8 H.sp.GreatLakesComplex 1
Eveness is a measure of distribution of genotype abundances so that a population with equally abundant genotypes yields value equal to 1 while a population dominated by single genotype is closer to zero.
round(((poppr(genpop_data))$E.5), 2) #show evenness E5 (Pielou 1975, Ludwig & Reynolds 1988, GrĂĽnwald et al. 2003)
## [1] 1 1 1 NaN 1 1 1 1 1
A significant p-value indicates lower observed heterozygosity
if ((bartlett.test(list(summary(genpop_data)$Hexp, summary(genpop_data)$Hobs)))$p.value < 0.05) { #test for homogeneity of variances using Bartlett test
var_equal <- T #variances are significantly different
} else {var_equal <- F} #variances are not significantly different
t.test(summary(genpop_data)$Hexp, summary(genpop_data)$Hobs, #perform t-test
paired = T, var.equal = var_equal, alternative = "greater")
##
## Paired t-test
##
## data: summary(genpop_data)$Hexp and summary(genpop_data)$Hobs
## t = 50.54, df = 4628, p-value < 2.2e-16
## alternative hypothesis: true mean difference is greater than 0
## 95 percent confidence interval:
## 0.03905451 Inf
## sample estimates:
## mean difference
## 0.0403686
Ignore any warnings that may come up (not shown in output here). Export figures with significantly higher width to achieve higher resolution and interpretability (they appear small here due to their large width of figure).
hwe_results <- lapply(seppop(genpop_data), hw.test, B = 0) #ignore any warnings showing up
hwe_pvalues_df <- melt(sapply(hwe_results, "[", i = T, j = 3)) #extract p-values from each population's test results and convert matrix to data frame for ggplot
colnames(hwe_pvalues_df) <- c("Locus", "Population", "P_value")
hwe_pvalues_df$P_value_adj <- p.adjust(hwe_pvalues_df$P_value, method = "fdr") #perform FDR correction
hwe_pvalues_df$Significant <- hwe_pvalues_df$P_value_adj < 0.05 #create significance column based on adjusted p-values
ggplot(hwe_pvalues_df, aes(x = Locus, y = Population, fill = P_value)) + #plot full heatmap of P-values
geom_tile(color = "white") +
scale_fill_viridis_c(option = "mako", name = "P-value", direction = -1) +
labs(x = "Locus", y = "Population",
title = "HWE p-values across loci and populations") +
theme_minimal() +
theme(axis.text.x = element_blank(),
axis.text.y = element_text(size = 8),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
plot.title = element_text(hjust = 0.5),
plot.margin = margin(0, 0, 0, 0))
ggplot(hwe_pvalues_df, aes(x = Locus, y = Population)) + #plot significant p-values
geom_tile(aes(fill = Significant), color = "white") + #highlight only significant p-values
scale_fill_manual(values = c("white", "red"), name = "Significant") + #grey for non-significant, red for significant
labs(x = "Locus", y = "Population",
title = "Significant HWE p-values (FDR corrected)") +
theme_minimal() +
theme(axis.text.x = element_blank(),
axis.text.y = element_text(size = 8),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
plot.title = element_text(hjust = 0.5),
plot.margin = margin(0, 0, 0, 0))
calc.heterozygosity <- function(genind_data) { #create function to calculate gene diversity within each population
if (!inherits(genind_data, "genind")) {
stop("Input must be a genind object")}
populations <- levels(pop(genind_data))
observed_heterozygosity <- numeric(length(populations))
expected_heterozygosity <- numeric(length(populations))
for (i in seq_along(populations)) {
pop <- populations[i]
pop_data <- genind_data[pop(genind_data) == pop] #subset genind object by population
if (nInd(pop_data) == 0) {
warning(paste("No data for population:", pop))
next}
tryCatch({
summary_stats <- summary(pop_data) #check if Hobs and Hexp are present before computing means
if (!is.null(summary_stats$Hobs)) {
observed_heterozygosity[i] <- mean(summary_stats$Hobs, na.rm = T)}
if (!is.null(summary_stats$Hexp)) {
expected_heterozygosity[i] <- mean(summary_stats$Hexp, na.rm = T)}
}, error = function(e) {
warning(paste("Error processing population:", pop, "Details:", e$message))})}
heterozygosity_df <- data.frame(
Population = populations,
Observed = round(observed_heterozygosity, 2),
Expected = round(expected_heterozygosity, 2))
print(heterozygosity_df) #print results
return(heterozygosity_df)} #return heterozygosity data frame
plot_heterozygosity <- function(heterozygosity_df) { #create function for plotting
heterozygosity_long <- reshape2::melt(heterozygosity_df, id.vars = "Population",
variable.name = "Type", value.name = "Heterozygosity") #melt data frame for plotting
ggplot(heterozygosity_long, aes(x = Population, y = Heterozygosity, fill = Type)) +
geom_bar(stat = "identity", position = position_dodge(width = 0.8)) +
labs(x = "Population", y = "Heterozygosity", fill = "Type") +
theme_classic() +
scale_fill_manual(values = c("Observed" = "darkblue", "Expected" = "lightblue"))} #set fill colors
heterozygosity_df <- calc.heterozygosity(genpop_data) #calculate heterozygosity
## Population Observed Expected
## 1 H.latifascia 0.11 0.12
## 2 H.lucina 0.09 0.10
## 3 H.maia 0.10 0.13
## 4 H.maia.x.lucina 0.10 0.28
## 5 H.nevadensis 0.02 0.04
## 6 H.peigleri 0.12 0.12
## 7 H.slosseri 0.11 0.13
## 8 H.sp.GreatLakesComplex 0.11 0.13
plot_heterozygosity(heterozygosity_df) #plot heterozygosity
genpop_data_loci <- pegas::Fst(pegas::as.loci(genpop_data))
head(round(genpop_data_loci, 2)) #per-locus F-statistics
## Fit Fst Fis
## 1344:7:- 0.41 0.18 0.28
## 1538:93:+ 0.62 0.38 0.38
## 1604:40:- 0.61 0.29 0.45
## 1802:5:+ 0.00 0.06 -0.06
## 2448:32:- -0.01 0.00 -0.02
## 3144:28:+ 0.31 0.26 0.06
round(colMeans(genpop_data_loci[, !is.na(colMeans(genpop_data_loci, na.rm = T)) & !is.nan(colMeans(genpop_data_loci, na.rm = T))], na.rm = T), 2) #global F-statistics (take into account all genotypes and loci but ignore loci with NA)
## Fit Fst Fis
## 0.27 0.10 0.20
round(boot.vc(genind2hierfstat(genpop_data)[1], genind2hierfstat(genpop_data)[-1], #calculate CIs of F-statistics (H-Total: total expected heterozygosity, F-pop/Total = Fst, F-Ind/Total = Fit, H-pop: expected heterozygosity within populations, F-Ind/pop: Fis, Hobs: observed heterozygosity) ot = 100)$ci, 2)
nboot = 300)$ci, 2) #specify number of bootstraps (increase for proper analysis)
## H-Total F-pop/Total F-Ind/Total H-pop F-Ind/pop Hobs
## 2.5% 0.14 0.15 0.32 0.12 0.20 0.09
## 50% 0.14 0.15 0.33 0.12 0.20 0.09
## 97.5% 0.14 0.16 0.33 0.12 0.21 0.10
Create function to compute pairwise genetic differentiation among populations and plot results as heatmap
calc.and.plot.pairwise.gen.dist.mat <- function(pop_dataset, #genind or genlight object with “pop” factor for each individual
genetic_diff_measure = "Fst", #one of "D_Jost", "Gst_Hedrick", "Fst", or "Ds_Nei".
viridis_col = "mako", #name of Viridis palette (e.g., "mako", "viridis", magma etc.)
nboots_Fst = 1000, #number of bootstraps (only for Fst)
star_col_Fst = "#FFA500", #color for significance star in plot (only for Fst)
signif_stars_Fst_plot = F, #add significance stars (for significant p-values after Bonferroni correction) in plot (only for Fst)
min_n_per_pop = 5) { #minimum number of individuals required in each population so that populations with fewer than this threshold will be dropped
## Check sample sizes
pop_sizes <- table(adegenet::pop(pop_dataset)) #tabulate number of individuals per population
small_pops <- names(pop_sizes[pop_sizes < min_n_per_pop]) #identify populations with fewer than min_n_per_pop individuals
if (length(small_pops) > 0) {
cat("The following populations have fewer than", min_n_per_pop, "individuals and were removed for analysis:",
paste(small_pops, collapse = ", "), "\n")
pop_dataset <- pop_dataset[!adegenet::pop(pop_dataset) %in% small_pops, ] #drop these populations
}
## Compute genetic distance
distance_matrix <- switch(genetic_diff_measure,
## Jost’s D (2008) using mmod::pairwise_D
"D_Jost" = as.matrix(mmod::pairwise_D(pop_dataset, linearized = FALSE)),
## Hedrick’s G’ST (2005) using mmod::pairwise_Gst_Hedrick
"Gst_Hedrick" = as.matrix(mmod::pairwise_Gst_Hedrick(pop_dataset, linearized = FALSE)),
## Weir & Cockerham’s FST (1984) using StAMPP
"Fst" = {genind_obj <- StAMPP::stamppConvert(dartR::gi2gl(pop_dataset, verbose = 0), type = "genlight") #convert pop_dataset to genlight and then genind
fst_results <- StAMPP::stamppFst(genind_obj, nboots = nboots_Fst, percent = 95, nclusters = 1) #compute FST with bootstrapping for confidence intervals
list(values = as.matrix(fst_results$Fsts), p_values = as.matrix(fst_results$Pvalues))}, #extract FST matrix and p-values matrix
## Nei’s standard genetic distance Ds (1972) using genet.dist
"Ds_Nei" = as.matrix(genet.dist(pop_dataset, method = "Ds")),
## Print error if unsupported measure is supplied
stop("Unsupported genetic differentiation measure. Choose from: ", "'D_Jost', 'Gst_Hedrick', 'Fst', 'Ds_Nei'."))
## Extract Fst and p-values
if (genetic_diff_measure == "Fst") {
fst_values <- distance_matrix$values
p_values <- distance_matrix$p_values
cat("p-values for Fst:\n")
print(p_values)
} else {
fst_values <- distance_matrix
p_values <- NULL
}
## Replace negative distances with zero
fst_values[fst_values < 0] <- 0
## Prepare matrix for plotting
## Labels for legend
label_list <- c("D_Jost" = "Jost’s D", "Gst_Hedrick" = "G’", "Fst" = "F", "Ds_Nei" = "G")
label <- label_list[[genetic_diff_measure]]
## For measures that report "G_ST" or "F_ST"
label_lowercase <- ifelse(genetic_diff_measure %in% c("Gst_Hedrick", "Fst", "Ds_Nei"), "ST", "")
## Print labels and Fst values
cat(paste(label, "values:\n"))
print(round(fst_values, 2))
## Create matrix that keeps only lower triangle
distance_matrix_lower <- fst_values
distance_matrix_lower[!lower.tri(distance_matrix_lower, diag = FALSE)] <- NA
## Melt to long format for ggplot
distance_matrix_long <- reshape2::melt(distance_matrix_lower, na.rm = TRUE)
colnames(distance_matrix_long) <- c("Row", "Column", "Distance")
## Add p-values and significance stars for Fst
if (!is.null(p_values)) {
## Mask upper triangle of p-values matrix
p_values_lower <- p_values
p_values_lower[!lower.tri(distance_matrix_lower, diag = FALSE)] <- NA
## Melt p-values to long format
p_values_long <- reshape2::melt(p_values_lower, na.rm = TRUE)
colnames(p_values_long) <- c("Row", "Column", "P_value")
## Merge distances with p-values
distance_matrix_long <- merge(distance_matrix_long, p_values_long, by = c("Row", "Column"))
## Bonferroni correction: α = 0.05 / number_of_comparisons
n_comparisons <- nrow(distance_matrix_long)
bonferroni_threshold <- 0.05 / n_comparisons
## Significance star if P ≤ threshold
distance_matrix_long$Significance <- ifelse(distance_matrix_long$P_value <= bonferroni_threshold, "*", "")
}
## Plot heatmap
## Include significance stars on tiles for Fst
if (genetic_diff_measure == "Fst") {
if (signif_stars_Fst_plot) {
plot <- ggplot2::ggplot(distance_matrix_long, aes(x = Column, y = Row, fill = Distance)) +
geom_tile() +
geom_text(aes(label = Significance), size = 7, color = star_col_Fst, na.rm = TRUE) +
scale_fill_viridis_c(option = viridis_col, direction = -1) +
theme_classic() +
labs(x = "Population", y = "Population", fill = bquote(.(label)[.(label_lowercase)]))
} else {
plot <- ggplot2::ggplot(distance_matrix_long, aes(x = Column, y = Row, fill = Distance)) +
geom_tile() +
scale_fill_viridis_c(option = viridis_col, direction = -1) +
theme_classic() +
labs(x = "Population", y = "Population", fill = bquote(.(label)[.(label_lowercase)]))
}
} else {
## Non-FST measures
plot <- ggplot2::ggplot(distance_matrix_long, aes(x = Column, y = Row, fill = Distance)) +
geom_tile() +
scale_fill_viridis_c(option = viridis_col, direction = -1) +
theme_classic() +
labs(x = "Population", y = "Population", fill = bquote(.(label)[.(label_lowercase)]))}
## Print heatmap
print(plot)
}
Ignore any warnings that may come up (not shown in output here)
calc.and.plot.pairwise.gen.dist.mat(genpop_data, genetic_diff_measure = "Fst", nboots_Fst = 1000, signif_stars_Fst_plot = F) #compute and plot Fst (Weir and Cockerham 1984) with bootstrapping for Bonferroni corrected p-values
## The following populations have fewer than 5 individuals and were removed for analysis: H.maia.x.lucina
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
## Registered S3 method overwritten by 'genetics':
## method from
## [.haplotype pegas
## Starting gl.compliance.check
## Processing genlight object with SNP data
## The slot loc.all, which stores allele name for each locus, is empty.
## Creating a dummy variable (A/C) to insert in this slot.
## Checking coding of SNPs
## SNP data scored NA, 0, 1 or 2 confirmed
## Checking locus metrics and flags
## Recalculating locus metrics
## Checking for monomorphic loci
## Dataset contains monomorphic loci
## Checking for loci with all missing data
## No loci with all missing data detected
## Checking whether individual names are unique.
## Checking for individual metrics
## Warning: Creating a slot for individual metrics
## Checking for population assignments
## Population assignments confirmed
## Spelling of coordinates checked and changed if necessary to
## lat/lon
## Completed: gl.compliance.check
## p-values for Fst:
## H.maia H.latifascia H.sp.GreatLakesComplex H.lucina
## H.maia NA NA NA NA
## H.latifascia 0 NA NA NA
## H.sp.GreatLakesComplex 0 0 NA NA
## H.lucina 0 0 0 NA
## H.nevadensis 0 0 0 0
## H.slosseri 0 0 0 0
## H.peigleri 0 0 0 0
## H.nevadensis H.slosseri H.peigleri
## H.maia NA NA NA
## H.latifascia NA NA NA
## H.sp.GreatLakesComplex NA NA NA
## H.lucina NA NA NA
## H.nevadensis NA NA NA
## H.slosseri 0 NA NA
## H.peigleri 0 0 NA
## F values:
## H.maia H.latifascia H.sp.GreatLakesComplex H.lucina
## H.maia NA NA NA NA
## H.latifascia 0.06 NA NA NA
## H.sp.GreatLakesComplex 0.04 0.05 NA NA
## H.lucina 0.13 0.18 0.14 NA
## H.nevadensis 0.27 0.35 0.37 0.49
## H.slosseri 0.09 0.09 0.10 0.21
## H.peigleri 0.08 0.07 0.08 0.20
## H.nevadensis H.slosseri H.peigleri
## H.maia NA NA NA
## H.latifascia NA NA NA
## H.sp.GreatLakesComplex NA NA NA
## H.lucina NA NA NA
## H.nevadensis NA NA NA
## H.slosseri 0.32 NA NA
## H.peigleri 0.37 0.06 NA
calc.and.plot.pairwise.gen.dist.mat(genpop_data, genetic_diff_measure = "Gst_Hedrick") #compute and plot Hedrick's G'st (Hedrick 2005)
## The following populations have fewer than 5 individuals and were removed for analysis: H.maia.x.lucina
## G’ values:
## H.latifascia H.lucina H.maia H.nevadensis H.peigleri
## H.latifascia 0.00 0.22 0.07 0.36 0.09
## H.lucina 0.22 0.00 0.17 0.49 0.24
## H.maia 0.07 0.17 0.00 0.35 0.10
## H.nevadensis 0.36 0.49 0.35 0.00 0.38
## H.peigleri 0.09 0.24 0.10 0.38 0.00
## H.slosseri 0.11 0.25 0.11 0.36 0.07
## H.sp.GreatLakesComplex 0.07 0.18 0.06 0.37 0.10
## H.slosseri H.sp.GreatLakesComplex
## H.latifascia 0.11 0.07
## H.lucina 0.25 0.18
## H.maia 0.11 0.06
## H.nevadensis 0.36 0.37
## H.peigleri 0.07 0.10
## H.slosseri 0.00 0.12
## H.sp.GreatLakesComplex 0.12 0.00
calc.and.plot.pairwise.gen.dist.mat(genpop_data, genetic_diff_measure = "Ds_Nei") #compute and plot Nei's standard genetic distance Ds (Nei 1972)
## The following populations have fewer than 5 individuals and were removed for analysis: H.maia.x.lucina
## G values:
## H.latifascia H.lucina H.maia H.nevadensis H.peigleri
## H.latifascia 0.00 0.04 0.02 0.05 0.02
## H.lucina 0.04 0.00 0.03 0.07 0.05
## H.maia 0.02 0.03 0.00 0.05 0.02
## H.nevadensis 0.05 0.07 0.05 0.00 0.05
## H.peigleri 0.02 0.05 0.02 0.05 0.00
## H.slosseri 0.02 0.05 0.02 0.05 0.02
## H.sp.GreatLakesComplex 0.02 0.03 0.01 0.05 0.02
## H.slosseri H.sp.GreatLakesComplex
## H.latifascia 0.02 0.02
## H.lucina 0.05 0.03
## H.maia 0.02 0.01
## H.nevadensis 0.05 0.05
## H.peigleri 0.02 0.02
## H.slosseri 0.00 0.02
## H.sp.GreatLakesComplex 0.02 0.00
calc.and.plot.pairwise.gen.dist.mat(genpop_data, genetic_diff_measure = "D_Jost") #compute and plot Jost’s D (2008)
## The following populations have fewer than 5 individuals and were removed for analysis: H.maia.x.lucina
## Jost’s D values:
## H.latifascia H.lucina H.maia H.nevadensis H.peigleri
## H.latifascia 0.00 0.03 0.01 0.04 0.01
## H.lucina 0.03 0.00 0.02 0.07 0.04
## H.maia 0.01 0.02 0.00 0.04 0.01
## H.nevadensis 0.04 0.07 0.04 0.00 0.05
## H.peigleri 0.01 0.04 0.01 0.05 0.00
## H.slosseri 0.02 0.04 0.02 0.05 0.01
## H.sp.GreatLakesComplex 0.01 0.03 0.01 0.05 0.01
## H.slosseri H.sp.GreatLakesComplex
## H.latifascia 0.02 0.01
## H.lucina 0.04 0.03
## H.maia 0.02 0.01
## H.nevadensis 0.05 0.05
## H.peigleri 0.01 0.01
## H.slosseri 0.00 0.02
## H.sp.GreatLakesComplex 0.02 0.00
Inbreeding represents the excess of homozygosity in an individual due to inheriting two identical alleles from recent common ancestor. It can lead to inbreeding depression so the associated loss of fitness, often due to recessive deleterious alleles that are more frequent than in the population. Ignore any warnings that may come up (not shown in output here)
genpop_data_inbreeding_mean <- sapply(inbreeding(genpop_data, N = 1000, M = 2000), mean) #compute mean inbreeding values for each individual (mean from the likelihood distribution of each individual)
hist(genpop_data_inbreeding_mean, main = "Average inbreeding in individuals") #plot distribution of (mean) inbreeding values for each individual
round(tail(sort(genpop_data_inbreeding_mean), n = 10), 2) #list ten individuals with highest mean inbreeding
## bog_NY_DN2502 TX_Andrews_Hmg026 TX_Lampasas_Hmg080
## 0.51 0.51 0.51
## NM_Escondido_Hmg195 maia_MA_DN2543 nev_CA_DN2395
## 0.51 0.52 0.52
## MN_CarlosAverySWA_Hmg324 WI_RomePond1_Hmg017 nev_NV_DR99
## 0.52 0.52 0.52
## maia_MA_DN2379
## 0.52
genpop_data_inbreeding_df <- data.frame(ID = names(genpop_data_inbreeding_mean), Inbreeding_Mean = genpop_data_inbreeding_mean, Population = genpop_data$pop)
ggplot(genpop_data_inbreeding_df, aes(x = Inbreeding_Mean)) + #plot distribution of (mean) inbreeding values for each individual for each population
facet_wrap(~ Population, scales = "free_y") +
geom_histogram(binwidth = 0.005, fill = "lightblue", color = "black", alpha = 0.8) +
labs(x = "Individual mean inbreeding value", y = "Frequency") +
theme_classic() +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())
genpop_data_inbreeding_df %>% group_by(Population) %>% top_n(5, Inbreeding_Mean) %>% arrange(Population, desc(Inbreeding_Mean)) %>% mutate(Inbreeding_Mean = round(Inbreeding_Mean, 2)) #list ten individuals with highest mean inbreeding
## # A tibble: 36 Ă— 3
## # Groups: Population [8]
## ID Inbreeding_Mean Population
## <chr> <dbl> <fct>
## 1 MN_CarlosAverySWA_Hmg324 0.52 H.latifascia
## 2 maia_IA_DN2517 0.51 H.latifascia
## 3 maia_IA_DN2480 0.51 H.latifascia
## 4 MN_CarlosAverySWA_Hmg327 0.5 H.latifascia
## 5 NE_Mullen_Hmg042 0.5 H.latifascia
## 6 MA_Montague_Hmg044 0.51 H.lucina
## 7 luc_CT_DN2565 0.5 H.lucina
## 8 luc_CT_DN2563 0.5 H.lucina
## 9 NH_CheshireCo_Hmg130 0.5 H.lucina
## 10 NH_CheshireCo_Hmg129 0.5 H.lucina
## # ℹ 26 more rows
Using corrected index of association rd (Agapow & Burt 2001) to estimate linkage between loci for each population (less biased version of original index of association by Brown et al. (1980) that accounts for number of sampled loci; it uses permutation test with null hypothesis of unlinked loci). r̄d near 0: very low linkage disequilibrium (loci mostly assort independently - random mating), r̄d 0.01-0.05: Moderate linkage disequilibrium (some degree of non-random association), r̄d > 0.05: Relatively high linkage disequilibrium (can indicate strong clonality, strong population structure, or selection); but numbers are relative so that they need to be compared between populations!
results_list <- list() #initialize empty list to store results for each population
## Loop over each population level in dataset
for (pop in levels(pop(genpop_data))) {
## Subset data for population
pop_data <- genpop_data[pop(genpop_data) == pop, , drop = TRUE]
n <- nInd(pop_data)
## Skip if fewer than 5 individuals
if (n < 5) {results_list[[pop]] <- c(rbarD = NA, p.value = NA); next}
## Calculate standardized index of association (rbarD) with permutations
ia_result <- poppr::ia(pop_data, sample = 100, index = "rbarD")
## Store rbarD and p-value
results_list[[pop]] <- c(rbarD = ia_result["rbarD"], p.value = ia_result["p.rD"])
}
## Combine results, convert to data frame and show results
genpop_data_LD <- do.call(rbind, results_list)
colnames(genpop_data_LD) <- c("rbarD", "p.value")
genpop_data_LD <- data.frame(Population = rownames(genpop_data_LD), rd = as.numeric(genpop_data_LD[, "rbarD"]),
p.value = as.numeric(genpop_data_LD[, "p.value"]), row.names = NULL)
genpop_data_LD
## Population rd p.value
## 1 H.latifascia 0.012451220 0.00990099
## 2 H.lucina 0.023248025 0.00990099
## 3 H.maia 0.009596439 0.00990099
## 4 H.maia.x.lucina NA NA
## 5 H.nevadensis 0.057168271 0.00990099
## 6 H.peigleri 0.009097995 0.00990099
## 7 H.slosseri 0.025596253 0.00990099
## 8 H.sp.GreatLakesComplex 0.012041593 0.00990099
genetic_dist <- dist(sqrt(adegenet::tab(genpop_data, freq = T)), method = "euclidean")
geo_dist <- geosphere::distm(as.matrix(dataset[, c("Longitude", "Latitude")]), fun = geosphere::distVincentySphere)
mantel_test <- ade4::mantel.randtest(as.dist(genetic_dist), as.dist(geo_dist), nrepet = 1000)
mantel_test #print Mantel test results
## Monte-Carlo test
## Call: ade4::mantel.randtest(m1 = as.dist(genetic_dist), m2 = as.dist(geo_dist),
## nrepet = 1000)
##
## Observation: 0.3677038
##
## Based on 1000 replicates
## Simulated p-value: 0.000999001
## Alternative hypothesis: greater
##
## Std.Obs Expectation Variance
## 20.0946833614 -0.0003455084 0.0003354669
plot(mantel_test) #plot results: black dot indicates observed correlation and histograms show permuted values
combined_data <- data.frame(genetic_dist = reshape2::melt(as.matrix(genetic_dist))$value, #combine genetic and geographic distance matrices into one data frame
geo_dist = reshape2::melt(as.matrix(geo_dist))$value)
combined_data <- combined_data[combined_data$genetic_dist != 0 & combined_data$geo_dist != 0, ] #remove self-comparisons where distance is 0
svg("IBD across all individuals.svg", width = 25/2.56, height = 15/2.56)
ggplot(combined_data, aes(x = geo_dist / 1000, y = genetic_dist)) + #plot IBD
geom_point(alpha = 0.2, color = "black", size = 1.6) + #scatter plot with points
geom_smooth(method = "lm", color = "orange", linewidth = 1.8) + #linear model fit
labs(x = "Geographic distance (km)", y = "Genetic distance (Edward's)", title = "Isolation-by-distance") +
theme_classic() +
theme(plot.title = element_text(hjust = 0.5), #center plot title
axis.text = element_text(color = "black")) #color axis text
dev.off()
## png
## 2
population_subsets <- list() #define list to store subsets for each population
populations <- unique(genpop_data@pop) #extract unique populations from genind object
for (pop in populations) { #create subsets for each population
pop_genind_subset <- genpop_data[genpop_data@pop == pop, ] #subset genind for population
pop_coords_subset <- dataset[dataset$Species == pop, ] #subset coordinates for population
population_subsets[[pop]] <- list(genind_obj = pop_genind_subset, coords_df = pop_coords_subset)} #store subsets in list
calc.dist.mat.and.run.Mantel.test <- function(genind_obj, coords_df, N_Mantel = 10000, min_ind = 5) {
if (nrow(coords_df) < min_ind) { #check if population has more than 3 individuals
cat("Skipping population", unique(genind_obj@pop), "due to fewer than", min_ind, "individuals.\n")
return(NULL)} #skip populations with 3 or fewer individuals
genetic_dist <- tryCatch({ #calculate genetic distance (Edwards' distance)
dist(sqrt(adegenet::tab(genind_obj, freq = T)), method = "euclidean")
}, error = function(e) {cat("Error in genetic distance calculation:", e$message, "\n")
return(NULL)})
geo_dist <- tryCatch({ #calculate geographic distance (Vincenty distance)
geosphere::distm(as.matrix(coords_df[, c("Longitude", "Latitude")]), fun = geosphere::distVincentySphere)
}, error = function(e) {cat("Error in geographic distance calculation:", e$message, "\n")
return(NULL)})
mantel_test <- NULL #perform Mantel test if both distance matrices are available
if (!is.null(genetic_dist) && !is.null(geo_dist)) {
mantel_test <- tryCatch({
ade4::mantel.randtest(as.dist(genetic_dist), as.dist(geo_dist), nrepet = N_Mantel)
}, error = function(e) {cat("Error in Mantel test:", e$message, "\n")
return(NULL)})
} else {cat("Skipping Mantel test due to NULL distance matrices.\n")}
plot_data <- NULL #prepare plot data and generate plot if distance matrices are available
combined_data <- NULL #initialize combined_data to store genetic and geographic distances
if (!is.null(genetic_dist) && !is.null(geo_dist)) {
genetic_dist_matrix <- as.matrix(genetic_dist)
geo_dist_matrix <- as.matrix(geo_dist)
combined_data <- data.frame( #combine matrices for plotting
genetic_dist = reshape2::melt(genetic_dist_matrix)$value,
geo_dist = reshape2::melt(geo_dist_matrix)$value,
Population = unique(genind_obj@pop)) #add population information
combined_data <- combined_data[combined_data$genetic_dist != 0 & combined_data$geo_dist != 0, ] #remove self-comparisons
mantel_p_value <- ifelse(is.null(mantel_test), NA, mantel_test$pvalue) #get Mantel p-value
plot_title <- paste("Isolation-by-distance for", unique(genind_obj@pop), "\nMantel test p-value:", format(mantel_p_value, digits = 3)) #title with p-value
plot_data <- ggplot(combined_data, aes(x = geo_dist / 1000, y = genetic_dist)) + #plot genetic vs geographic distance
geom_point(alpha = 0.5, color = "black", size = 1.8) + #scatter plot with points
geom_smooth(method = "loess", color = "darkgrey", linewidth = 0.9) + #LOESS smooth fit
geom_smooth(method = "lm", color = "orange", linewidth = 1.5) + #linear model fit
labs(x = "Geographic distance (km)", y = "Genetic distance (Edwards')", title = plot_title) + #include plot title with p-value
theme_classic() +
theme(plot.title = element_text(hjust = 0.5), #center plot title
axis.text = element_text(color = "black")) #color axis text
} else {cat("Skipping plot due to missing distance matrices.\n")}
return(list(mantel_test = mantel_test, plot_data = plot_data, combined_data = combined_data, num_individuals = nrow(coords_df)))} #return combined_data
results_list <- list() #initialize results list to store Mantel test results and plots
results_df <- data.frame(Population = character(), Num_Individuals = integer(), #initialize dataframe to store results
Mantel_p_value = numeric(), stringsAsFactors = F)
all_data <- data.frame() #initialize dataframe to store combined data for all populations
Ignore any warnings that may come up (not shown in output here)
results_list <- list() #initialize results list to store Mantel test results and plots
results_df <- data.frame(Population = character(), Num_Individuals = integer(), #initialize dataframe to store results
Mantel_p_value = numeric(), stringsAsFactors = F)
all_data <- data.frame() #initialize dataframe to store combined data for all populations
for (pop in names(population_subsets)) {
cat("Analyzing population:", pop, "\n")
pop_data <- population_subsets[[pop]]$genind_obj #extract genind object for population
pop_coords <- population_subsets[[pop]]$coords_df #extract coordinates dataframe for population
results <- calc.dist.mat.and.run.Mantel.test(genind_obj = pop_data, coords_df = pop_coords) #analyze population
if (!is.null(results)) { #store results if available
results_list[[pop]] <- results
results_df <- rbind(results_df, data.frame(
Population = pop,
Num_Individuals = results$num_individuals,
Mantel_obs = ifelse(is.null(results$mantel_test), NA, results$mantel_test$obs),
Mantel_p_value = ifelse(is.null(results$mantel_test), NA, results$mantel_test$pvalue),
stringsAsFactors = FALSE))
if (!is.null(results$combined_data)) {
all_data <- rbind(all_data, results$combined_data)} #append combined data for each population
if (!is.null(results$plot_data)) { #save plot if available
print(results$plot_data)}}}
## Analyzing population: H.maia
## Analyzing population: H.latifascia
## Analyzing population: H.sp.GreatLakesComplex
## Analyzing population: H.lucina
## Analyzing population: H.maia.x.lucina
## Skipping population 1 due to fewer than 5 individuals.
## Analyzing population: H.nevadensis
## Analyzing population: H.slosseri
## Analyzing population: H.peigleri
write.csv(results_df, "Mantel_test_results.csv", row.names = F)
Genetic clusters are inferred through eigenvector decomposition of the allele frequencies (matrices) among individuals, aiming to reduce the complexity of genetic data by projecting it into a lower-dimensional space with each axis (principal component) capturing a portion of the variance in the allele frequencies (Patterson et al. 2006, Reich et al. 2008).
cols_pop <- viridis::magma(n = length(levels(population_assignment)), begin = 1, end = 0) #define color palette
shapes_pop <- rep(c(21, 24, 22, 25), length.out = length(cols_pop)) #define shape palette
plot_width <- 25/2.56
plot_height <- 15/2.56
point_size <- 4.1
point_alpha <- 0.9
point_outline_color <- "black"
point_alpha_3D_plot <- 1
point_size_3D_plot <- 7
population_assignment_name <- "Species"
pop_data_datasets <- list(genpop_data = list(data = genpop_data))
handle.NA.inf.values <- function(data) {
data$tab <- as.matrix(data$tab)
data$tab <- data$tab[, colSums(is.na(data$tab) | is.infinite(data$tab)) < nrow(data$tab)]
data$tab <- apply(data$tab, 2, function(column) {
column_mean <- mean(column, na.rm = T)
column[is.na(column) | is.infinite(column)] <- column_mean
return(column)})
return(data)}
run.pca <- function(data) {
data <- handle.NA.inf.values(data)
prcomp(data$tab)}
calc.variance.explained <- function(pca_result) {
(pca_result$sdev^2 / sum(pca_result$sdev^2)) * 100}
calc.N.pcs.above.threshold <- function(var_expl_perc, threshold = 1) {
length(which(var_expl_perc >= threshold))}
create.scree.plot.df <- function(var_expl_perc) {
data.frame(PC = seq_along(var_expl_perc), VarianceExplained = var_expl_perc)}
create.pca.df <- function(pca_result, group) {
num_pcs <- ncol(pca_result$x) # total number of PCs
num_pcs_to_keep <- ceiling(num_pcs * 0.5) #keep 50% of total PCs
data.frame(pca_result$x[, 1:num_pcs_to_keep], group = group)}
plot.scree.plot <- function(scree_plot_df, threshold) {
num_pcs <- nrow(scree_plot_df) #total number of PCs
num_pcs_to_keep <- ceiling(num_pcs * 0.5) #keep 50% of total PCs
ggplot(scree_plot_df[1:num_pcs_to_keep, ], aes(x = PC, y = VarianceExplained)) +
geom_bar(stat = "identity", fill = "grey") +
geom_line(aes(y = cumsum(VarianceExplained)), color = "orange") +
geom_point(aes(y = cumsum(VarianceExplained)), color = "orange", size = 2) +
labs(x = "PC", y = "Explained variance (%)") +
geom_hline(yintercept = threshold, color = "black", linetype = "dashed") +
theme_classic()}
run.pca.for.datasets <- function(datasets) {
lapply(datasets, function(subset) {
pca_result <- run.pca(subset$data)
var_expl_perc <- calc.variance.explained(pca_result)
num_pcs <- calc.N.pcs.above.threshold(var_expl_perc)
scree_plot_df <- create.scree.plot.df(var_expl_perc)
print(plot.scree.plot(scree_plot_df, threshold = 1))
list(pca_result = pca_result, variance_explained = var_expl_perc, num_pcs_above_1perc = num_pcs)})}
pca_results <- run.pca.for.datasets(pop_data_datasets)
pca_df_genpop_data <- create.pca.df(pca_results$genpop_data$pca_result,
factor(genpop_data$pop, levels = unique(population_assignment)))
svg("PCA_PC1_PC2_genpop_data.svg", width = plot_width, height = plot_height)
ggplot(pca_df_genpop_data, aes(x = PC1, y = PC2)) + #PCA scatterplot of PC1 and PC2
geom_point(size = point_size, alpha = point_alpha, aes(shape = group, fill = group),
color = point_outline_color) +
scale_shape_manual(values = shapes_pop) +
scale_fill_manual(values = cols_pop) +
labs(x = sprintf("PC1 (%.2f%%)", pca_results$genpop_data$variance_explained[1]),
y = sprintf("PC2 (%.2f%%)", pca_results$genpop_data$variance_explained[2]),
shape = population_assignment_name,
fill = population_assignment_name) +
theme_classic() +
guides(override.aes = list(alpha = 1))
dev.off()
svg("PCA_PC1_PC3_genpop_data.svg", width = plot_width, height = plot_height)
ggplot(pca_df_genpop_data, aes(x = PC1, y = PC3)) + #PCA scatterplot of PC1 and PC3
geom_point(size = point_size, alpha = point_alpha, aes(shape = group, fill = group),
color = point_outline_color) +
scale_shape_manual(values = shapes_pop) +
scale_fill_manual(values = cols_pop) +
labs(x = sprintf("PC1 (%.2f%%)", pca_results$genpop_data$variance_explained[1]),
y = sprintf("PC3 (%.2f%%)", pca_results$genpop_data$variance_explained[2]),
shape = population_assignment_name,
fill = population_assignment_name) +
theme_classic() +
guides(override.aes = list(alpha = 1))
dev.off()
plotly::plot_ly( #3D plot of PCA using PC1-PC3
data = pca_df_genpop_data,
x = ~PC1,
y = ~PC2,
z = ~PC3,
color = ~group, colors = cols_pop,
type = 'scatter3d', mode = 'markers',
alpha = point_alpha_3D_plot, #marker transparency
marker = list(size = point_size_3D_plot)) #marker size
Overview
Discriminant analysis of principal components (DAPC) is a multivariate
method designed to identify and describe genetic clusters by determining
the optimal number of clusters (k) that best summarizes the data. It is
faster than traditional Bayesian clustering methods and does not assume
Hardy-Weinberg Equilibrium (HWE) or linkage between loci. It maximizes
variation between groups while minimizing variation within them,
offering a distinct advantage over methods like PCA, PCoA, and MANOVA
(which focus on global diversity but neglect group-specific
differences). Furthermore, it is potentially more powerful and accurate
in phylogenetic and hierarchical analyses (including hybridization) than
methods that minimize HWE and linkage disequilibrium (e.g., STRUCTURE)
(Kanno et al 2011, Dupuis and Sperling 2015, Jombart et al. 2010).
Approach
DAPC first transforms the data through principal component analysis
(PCA) to ensure the variables submitted to discriminant analysis (DA)
are uncorrelated and that their number is less than the number of
analyzed individuals (in contrast to a standard DA), and then then
applies DA to the retained principal components, aiming to discriminate
between predefined groups. The eigenvalues in a DAPC represent the ratio
of variance between groups over the variance within groups for each
discriminant function, indicating the discriminant power of each axis in
a DAPC plot.
How many PCs to retain?
Choosing the optimal number of retained principal components (PCs) is
crucial for the effectiveness of DAPC. Retaining too many components can
lead to overfitting and instability in membership probabilities, while
retaining too few may result in a loss of valuable information. This can
be done using the a-score (representing the difference between the
proportion of successful reassignments observed in the analysis and
values obtained using random groupings) or cross-validation. The latter
optimization procedure is most suitable and finds the optimal number of
retained PCs by splitting the data into two sets: a training (usually
90% of the data) and a validation set (typically the remaining 10%). The
validation set is selected using stratified random sampling, ensuring
that every group or population from the original dataset is represented
in both sets. DAPC is then performed on the training set while retaining
different numbers of PCs. The method then assesses how well the analysis
can predict group membership for the individuals in the validation set.
This helps to identify the optimal number of PCs to retain. The process
is repeated multiple times to improve accuracy. The final number of PCs
used in the analysis should be documented for transparency, along with
evidence supporting the choice of k (Miller et al. 2020).
A priori vs de novo clustering
DAPC can be performed either using a priori defined groupings (e.g.,
populations/species) (similar to a PCA) or it can identify genetic
clusters (k) de novo (e.g., similar to Structure). This decision can
affect the DAPC results and is therefore important to consider (Miller
et al. 2020). Misspecification of groupings can lead to artificially
large populations with Wahlund-like effects of apparent depressed
heterozygosity (Wahlund 1928), with these inflated population size
estimates preventing conservation and increasing risk of extinction for
one of “cryptic” genetic clusters, or it can lead to over splitting
populations that should be combined. De novo group assignments can be
inaccurate under conditions of high migration rates or low genetic
differentiation but is mostly reliable when migration rates are low
(Miller et al. 2020). For a priori assessments, the genetic distance
between clusters reflects underlying FST values (Miller et al. 2020).
Similar to other methods, artefactual discrete groups may be identified
in populations with continuously distributed (cline-like) genetic
diversity and under spatially heterogeneous sampling of populations.
However, incorporating scatterplots for a graphical assessment of the
genetic structures between clusters can allow to assess the organization
of genetic variability and reveal potential clines.
k-means clustering
When grouping information is unknown, DAPC can incorporate k-means
clustering to identify the optimal number of clusters k (Lee et
al. 2009, Liu & Zhao 2006). The k-means algorithm maximizes
variation between groups, and the optimal k is determined by comparing
clustering solutions using the Bayesian Information Criterion (BIC),
selecting the lowest BIC value.
Additional features
DAPC also provides membership probabilities for each individual, which
are based on the retained discriminant functions. These probabilities
differ from admixture coefficients in STRUCTURE but can similarly be
interpreted as reflecting the proximity of individuals to different
genetic clusters. They also offer insights into the distinctiveness of
identified clusters and potential admixture events. This feature makes
DAPC a potentially more powerful and accurate method for analyzing
phylogenetic relationships, hierarchical structures, and hybridization
events compared to methods like STRUCTURE, which minimize assumptions
about HWE and linkage disequilibrium (Kanno et al. 2011, Dupuis and
Sperling 2015, Jombart et al. 2010). Additionally, DAPC allows users to
identify and plot alleles that contribute most significantly to the
discrimination of clusters, providing further insight into genetic
differentiation.
Use larger number here (e.g., 200)
crossval_N <- 20 #number of cross-validation runs
genpop_data@tab <- matrix(
as.integer(round(apply(genpop_data@tab, 2, function(x) {
x[is.na(x)] <- mean(x, na.rm = T) #replace NA with column means
return(x)}))), #return modified column
nrow = nrow(genpop_data@tab), #number of rows
ncol = ncol(genpop_data@tab), #number of columns
dimnames = dimnames(genpop_data@tab)) #preserve original dimensions
original_populations <- unique(genpop_data@pop) #identify populations before filtering
genpop_data_DAPC <- genpop_data[adegenet::pop(genpop_data) %in% names(table(adegenet::pop(genpop_data))[table(adegenet::pop(genpop_data)) > 1]), ] #filter populations with fewer than 2 members
remaining_populations <- unique(genpop_data_DAPC@pop) #identify remaining populations
removed_populations <- setdiff(original_populations, remaining_populations) #identify removed populations
if (length(removed_populations) > 0) { #check if any populations were removed
cols_pop_DAPC <- cols_pop[-(which(original_populations %in% removed_populations))] #remove colors for removed populations
shapes_pop_DAPC <- shapes_pop[-which(original_populations %in% removed_populations)] #remove shapes for removed populations
} else { #no populations were removed so keep original color and shape assignments
cols_pop_DAPC <- cols_pop
shapes_pop_DAPC <- shapes_pop}
xval_results_genpop_data_DAPC_run1 <- adegenet::xvalDapc( #run1 of cross-validation
genpop_data_DAPC@tab, #genetic data matrix (allele frequencies)
adegenet::pop(genpop_data_DAPC), #population assignment (grouping factor) of individuals
training.set = 0.9, #use 90% of data for training during cross-validation
n.pca.max = ceiling(min(nrow(genpop_data_DAPC@tab), ncol(genpop_data_DAPC@tab)) * 0.6), #set maximum number of PCs to evaluate in cross-validation to 60% of total numbers of PCs
result = "groupMean", #use group means to calculate the cross-validation result
center = T, #center data before performing PCA
scale = F, #do not scale data (i.e., do not standardize allele frequencies)
n.rep = crossval_N, #set number of cross-validation replicates
xval.plot = T) #plot cross-validation results (mean squared error (MSE) vs. number of PCs)
optimal_pcs_genpop_data_DAPC_run1 <- xval_results_genpop_data_DAPC_run1[["Number of PCs Achieving Lowest MSE"]] #extract optimal number of PCs corresponding to lowest MSE from cross-validation results
print(optimal_pcs_genpop_data_DAPC_run1) #print optimal number of PCs for dataset
## [1] "25"
xval_results_genpop_data_DAPC_run2 <- adegenet::xvalDapc( #run2 of cross-validation
genpop_data_DAPC@tab, adegenet::pop(genpop_data_DAPC),
training.set = 0.9, result = "groupMean", center = T, scale = F, n.rep = crossval_N, xval.plot = T,
n.pca = seq(as.numeric(optimal_pcs_genpop_data_DAPC_run1) - 15, as.numeric(optimal_pcs_genpop_data_DAPC_run1) + 15, by = 1)) #refine PCs range
optimal_pcs_genpop_data_DAPC_run2 <- xval_results_genpop_data_DAPC_run2[["Number of PCs Achieving Lowest MSE"]] #extract refined optimal PCs
print(xval_results_genpop_data_DAPC_run2[2:6]) #print cross-validation results
## $`Median and Confidence Interval for Random Chance`
## 2.5% 50% 97.5%
## 0.08157054 0.13997273 0.19957360
##
## $`Mean Successful Assignment by Number of PCs of PCA`
## 10 11 12 13 14 15 16 17
## 0.9000000 0.9000000 0.8785714 0.8571429 0.8535714 0.9214286 0.8535714 0.8785714
## 18 19 20 21 22 23 24 25
## 0.8500000 0.8500000 0.9285714 0.9428571 0.9321429 0.9507143 0.9185714 0.9500000
## 26 27 28 29 30 31 32 33
## 0.9285714 0.9392857 0.9607143 0.9321429 0.9107143 0.9592857 0.9321429 0.9150000
## 34 35 36 37 38 39 40
## 0.9507143 0.9471429 0.9271429 0.9357143 0.9328571 0.8885714 0.9771429
##
## $`Number of PCs Achieving Highest Mean Success`
## [1] "40"
##
## $`Root Mean Squared Error by Number of PCs of PCA`
## 10 11 12 13 14 15 16
## 0.12975644 0.12371791 0.14638501 0.16751485 0.17713710 0.10350983 0.16366342
## 17 18 19 20 21 22 23
## 0.15319722 0.19561520 0.18210784 0.10594569 0.10101525 0.10714286 0.08695530
## 24 25 26 27 28 29 30
## 0.10907777 0.10101525 0.11293849 0.09449112 0.08601139 0.10473484 0.13646409
## 31 32 33 34 35 36 37
## 0.07686458 0.09715336 0.11216788 0.08348286 0.08736506 0.10613815 0.09583148
## 38 39 40
## 0.11640885 0.14178167 0.05091008
##
## $`Number of PCs Achieving Lowest MSE`
## [1] "40"
dapc_result_genpop_data <- adegenet::dapc(
genpop_data_DAPC, pop = adegenet::pop(genpop_data_DAPC),
n.pca = as.numeric(optimal_pcs_genpop_data_DAPC_run2),
n.da = length(levels(genpop_data_DAPC$pop)) - 1) #perform DAPC with optimal PCs (determined by crossvalidation)
save(xval_results_genpop_data_DAPC_run1, optimal_pcs_genpop_data_DAPC_run1,
xval_results_genpop_data_DAPC_run2, optimal_pcs_genpop_data_DAPC_run2,
dapc_result_genpop_data,
file = "DAPC_results_genpop_data.Rdata")
load("DAPC_results_genpop_data.Rdata")
par(mar = c(5, 5, 3, 3))
print(round(dapc_result_genpop_data$var, 2) * 100) #conserved variance in percent
## [1] 64
barplot(dapc_result_genpop_data$eig, names.arg = paste0("LD", seq_along(dapc_result_genpop_data$eig)), xlab = "Linear discriminants", ylab = "Eigenvalues") #plot eigenvalues of discriminant functions
round(dapc_result_genpop_data$eig, 1) #show eigenvalues of discriminant functions
## [1] 2597.9 1394.0 611.9 511.6 138.8 72.6
dapc_relative_LDs <- round(dapc_result_genpop_data$eig / sum(dapc_result_genpop_data$eig) * 100, 1)
barplot(dapc_relative_LDs, #plot relative proportions of eigenvalues of discriminant functions
names.arg = paste0("LD", seq_along(dapc_result_genpop_data$eig)),
xlab = "Linear discriminants", ylab = "Relative proportion of eigenvalues [%]")
LD1_label <- paste("LD1 (", dapc_relative_LDs[1], "%)", sep = "") #create axis labels for LD1 showing relative proportions of eigenvalues of discriminant functions in percent
LD2_label <- paste("LD2 (", dapc_relative_LDs[2], "%)", sep = "") #create axis labels for LD2 showing relative proportions of eigenvalues of discriminant functions in percent
LD3_label <- paste("LD3 (", dapc_relative_LDs[3], "%)", sep = "") #create axis labels for LD3 showing relative proportions of eigenvalues of discriminant functions in percent
dapc_df_genpop_data <- data.frame(dapc_result_genpop_data$ind.coord, group = factor(dapc_result_genpop_data$grp, levels = levels(population_assignment))) #create dataframe for visualization
svg("DAPC_LD1_LD2_genpop_data.svg", width = plot_width, height = plot_height)
ggplot(dapc_df_genpop_data, aes(x = LD1, y = LD2)) + #DAPC scatterplot of LD1 vs LD2
geom_point(size = point_size, alpha = point_alpha, aes(shape = group, fill = group), color = point_outline_color) +
scale_shape_manual(values = shapes_pop_DAPC) +
scale_fill_manual(values = cols_pop_DAPC) +
labs(shape = population_assignment_name, fill = population_assignment_name, x = LD1_label, y = LD2_label) +
theme_classic() +
guides(override.aes = list(alpha = 1))
dev.off()
svg("DAPC_LD1_LD3_genpop_data.svg", width = plot_width, height = plot_height)
ggplot(dapc_df_genpop_data, aes(x = LD1, y = LD3)) + #DAPC scatterplot of LD1 vs LD3
geom_point(size = point_size, alpha = point_alpha, aes(shape = group, fill = group), color = point_outline_color) +
scale_shape_manual(values = shapes_pop_DAPC) +
scale_fill_manual(values = cols_pop_DAPC) +
labs(shape = population_assignment_name, fill = population_assignment_name, x = LD1_label, y = LD3_label) +
theme_classic() +
guides(override.aes = list(alpha = 1))
dev.off()
plotly::plot_ly( #3D plot of DAPC using LD1-LD3
data = dapc_df_genpop_data,
x = ~LD1, y = ~LD2, z = ~LD3,
color = ~group, colors = cols_pop_DAPC,
type = 'scatter3d', mode = 'markers',
alpha = point_alpha_3D_plot, #marker transparency
marker = list(size = point_size_3D_plot)) #marker size
dapc_group_memberships <- as.data.frame(dapc_result_genpop_data$posterior) #convert posterior probabilities to data frame
dapc_group_memberships$ID <- row.names(dapc_group_memberships) #add ID column
dapc_group_memberships$Population <- genpop_data$pop[match(dapc_group_memberships$ID, indNames(genpop_data))] #match populations to IDs
dapc_group_memberships_long <- melt(dapc_group_memberships, id.vars = c("ID", "Population"), variable.name = "Cluster", value.name = "Probability") #reshape to long format
svg("DAPC_membership_probabilities.svg", width = 35/2.56, height = 30/2.56)
ggplot(dapc_group_memberships_long, aes(x = Probability, y = ID, fill = Cluster)) + #create Structure-like plots
geom_bar(stat = "identity") +
theme_classic() +
scale_fill_manual(values = cols_pop_DAPC) + #use viridis color palette
facet_wrap(~ Population, scales = "free_y") + #split y-axis by population
labs(fill = "Cluster") +
theme(axis.text.y = element_text(angle = 0, hjust = 1, size = 4.1), #modify size of y-axis labels
axis.text.x = element_blank(), #remove x-axis labels
axis.title.y = element_blank(), #remove y-axis title
axis.title.x = element_blank(), #remove x-axis title
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
plot.background = element_blank())
dev.off()
dapc_loading_LD1 <- adegenet::loadingplot(dapc_result_genpop_data$var.contr[, 1], lab.jitter = 100, #plot loading for first LD axis
thres = quantile(dapc_result_genpop_data$var.contr[, 1], 0.99)) #set threshold to 99th percentile (top 1% of loadings)
dapc_loading_LD2 <- adegenet::loadingplot(dapc_result_genpop_data$var.contr[, 2], lab.jitter = 100, #plot loading for second LD axis
thres = quantile(dapc_result_genpop_data$var.contr[, 2], 0.99)) #set threshold to 99th percentile (top 1% of loadings)
dapc_loading_LD1$var.names #print most differentiated alleles for LD1
## [1] "209826:27:+.0" "209826:27:+.1" "347604:17:+.0" "347604:17:+.1"
## [5] "548637:28:+.0" "548637:28:+.1" "607571:65:-.0" "607571:65:-.1"
## [9] "623498:13:+.0" "623498:13:+.1" "647037:10:+.0" "647037:10:+.1"
## [13] "686382:4:-.1" "697527:53:-.0" "697527:53:-.1" "707554:19:-.0"
## [17] "707554:19:-.1" "750141:18:-.0" "750141:18:-.1" "761132:51:-.0"
## [21] "761132:51:-.1" "762990:31:+.0" "762990:31:+.1" "917171:7:+.0"
## [25] "917171:7:+.1" "927164:6:+.0" "927164:6:+.1" "959252:10:+.0"
## [29] "959252:10:+.1" "1021247:13:-.1" "1021247:13:-.0" "1034909:8:-.0"
## [33] "1034909:8:-.1" "1050155:10:-.0" "1050155:10:-.1" "1079236:33:-.0"
## [37] "1079236:33:-.1" "1084025:16:-.0" "1084025:16:-.1" "1179508:42:-.0"
## [41] "1179508:42:-.1" "1294262:5:+.0" "1294262:5:+.1" "1315236:34:-.0"
## [45] "1315236:34:-.1" "1697906:21:+.0" "1697906:21:+.1" "1697966:7:-.0"
## [49] "1697966:7:-.1" "1702204:25:+.0" "1702204:25:+.1" "1739335:15:+.0"
## [53] "1739335:15:+.1" "1748724:12:+.0" "1748724:12:+.1" "1875485:36:+.0"
## [57] "1875485:36:+.1" "2004857:55:-.1" "2004857:55:-.0" "2011745:23:-.0"
## [61] "2011745:23:-.1" "2127415:13:+.0" "2127415:13:+.1" "2147586:28:-.0"
## [65] "2147586:28:-.1" "2202785:6:-.0" "2202785:6:-.1" "2702045:14:+.0"
## [69] "2702045:14:+.1" "2716161:8:-.0" "2716161:8:-.1" "2816714:11:-.0"
## [73] "2816714:11:-.1" "2841083:17:-.1" "2841083:17:-.0" "2939340:6:+.0"
## [77] "2939340:6:+.1" "3134272:9:+.0" "3134272:9:+.1" "3177228:11:+.0"
## [81] "3177228:11:+.1" "3278150:15:+.0" "3278150:15:+.1" "3293069:11:+.0"
## [85] "3293069:11:+.1" "3322807:5:-.0" "3322807:5:-.1" "3361896:11:+.0"
## [89] "3361896:11:+.1" "3394277:14:+.0" "3394277:14:+.1" "3408317:12:-.0"
## [93] "3408317:12:-.1"
dapc_loading_LD2$var.names #print most differentiated alleles for LD2
## [1] "131926:18:-.0" "131926:18:-.1" "151044:32:-.0" "151044:32:-.1"
## [5] "214427:4:-.0" "214427:4:-.1" "289798:19:-.0" "289798:19:-.1"
## [9] "320857:36:+.0" "320857:36:+.1" "383196:13:-.1" "383196:13:-.0"
## [13] "394990:26:+.0" "394990:26:+.1" "477702:9:-.0" "477702:9:-.1"
## [17] "621013:10:-.0" "621013:10:-.1" "647037:10:+.0" "647037:10:+.1"
## [21] "683141:30:-.0" "683141:30:-.1" "747425:5:+.0" "747425:5:+.1"
## [25] "843409:4:-.0" "843409:4:-.1" "963094:5:+.0" "963094:5:+.1"
## [29] "1097445:13:-.0" "1097445:13:-.1" "1264640:11:+.0" "1264640:11:+.1"
## [33] "1265194:10:+.0" "1265194:10:+.1" "1274354:6:-.0" "1370346:14:-.0"
## [37] "1370346:14:-.1" "1377033:39:-.0" "1377033:39:-.1" "1383308:16:-.0"
## [41] "1383308:16:-.1" "1444380:14:-.0" "1444380:14:-.1" "1613775:9:+.1"
## [45] "1613775:9:+.0" "1666926:22:+.0" "1666926:22:+.1" "1702076:29:-.1"
## [49] "1702076:29:-.0" "1757045:30:-.0" "1757045:30:-.1" "1783088:9:+.0"
## [53] "1783088:9:+.1" "1784216:10:-.0" "1784216:10:-.1" "1882276:9:-.0"
## [57] "1882276:9:-.1" "1912415:4:+.0" "1912415:4:+.1" "2028920:8:+.0"
## [61] "2028920:8:+.1" "2095604:4:+.0" "2095604:4:+.1" "2151331:14:-.0"
## [65] "2151331:14:-.1" "2175590:6:-.0" "2175590:6:-.1" "2186599:17:-.0"
## [69] "2186599:17:-.1" "2393970:12:+.0" "2393970:12:+.1" "2486135:9:-.0"
## [73] "2486135:9:-.1" "2511053:6:-.0" "2511053:6:-.1" "2625995:34:+.0"
## [77] "2625995:34:+.1" "2692085:33:+.0" "2692085:33:+.1" "2871045:6:+.0"
## [81] "2871045:6:+.1" "2971377:6:-.0" "2971377:6:-.1" "3059401:17:+.0"
## [85] "3059401:17:+.1" "3167391:9:-.0" "3167391:9:-.1" "3306116:7:+.0"
## [89] "3306116:7:+.1" "3402547:9:+.0" "3402547:9:+.1" "3421689:6:+.0"
## [93] "3421689:6:+.1"
intersect(dapc_loading_LD1$var.names, dapc_loading_LD2$var.names) #show differentiated alleles for both LD1 and LD2
## [1] "647037:10:+.0" "647037:10:+.1"
dapc_group_memberships <- as.data.frame(dapc_result_genpop_data$posterior) #convert posterior probabilities to data frame
subset_size <- 40 #define subset size
plot_assignments <- function(dapc_result, subset_size, max_rows) { #plot group memberships (heat colors represent membership probabilities with red=1 and white=0, blue crosses represent prior Cluster_assignment provided to DAPC)
num_plots <- ceiling(max_rows / subset_size) #calculate number of plots needed
for (i in 0:(num_plots - 1)) {
start_row <- i * subset_size + 1 #start index for subset
end_row <- min((i + 1) * subset_size, max_rows) #end index for subset
cat("Plotting rows", start_row, "to", end_row, "\n") #print status
adegenet::assignplot(dapc_result, subset = start_row:end_row)}} #plot assignments for current subset
par(mar = c(5, 13, 3, 3))
plot_assignments(dapc_result_genpop_data, subset_size, nrow(dapc_group_memberships)) #plot all subsets
## Plotting rows 1 to 40
## Plotting rows 41 to 80
## Plotting rows 81 to 118
dapc_result_genpop_data$grp <- factor(dapc_result_genpop_data$grp, levels = unique(genpop_data_DAPC$pop))
admixed_individuals <- which(apply(dapc_result_genpop_data$posterior, 1, function(e) all(e < 0.80))) #identify admixed individuals (posterior probabilities less than 0.80)
admixed_individuals #show admixed individuals
## named integer(0)
if (length(admixed_individuals) > 0) {
adegenet::compoplot(dapc_result_genpop_data, col.pal = cols_pop_DAPC, subset = admixed_individuals)}
## Create function to plot sample map with cluster assignment for each individual
plot.Map.assig.prob <- function(ancestry_matrix,
Coordinates, #coordinates as "Latitude" and "Longitude" columns in dataframe/matrix
lat.buffer.range = 2, #add coordinates as buffer range around latitude coordinates
lon.buffer.range = 2, #add coordinates as buffer range around longitude coordinates
save = F, #whether to save plot or not
overwrite = T, #whether to overwrite if file exists
file.name = NULL, #plot file name (NULL = default file name)
type = "svg", #plot format type (options: "png", "svg", "jpg")
width = 15, #plot width in cm
height = 20, #plot height in cm
resolution = 300, #plot resolution in dpi
pie.size = 1.9, #pie chart size
pie.col.pal, #color palette of pie charts
USA.add.states = T, #option to add US states (only works if range includes USA)
USA.add.counties = F, #option to add US counties (only works if range includes USA)
USA.state.lwd = 0.5, #linewidth of US state borders (only works if range includes USA)
USA.county.lwd = 0.3, #linewidth of US county borders (only works if range includes USA)
north.arrow.position = c(0.03, 0.88), #position (x, y) of north arrow relative to map
north.arrow.length = 0.7, #length of north arrow
north.arrow.lwd = 2, #linewidth of north arrow
north.arrow.N.position = 0.3, #position of north arrow "N"
north.arrow.N.size = 1, #size of north arrow "N"
scale.position = c(0.03, 0.05), #relative position (x, y) of scale
scale.size = 0.16, #size of scale
scale.font.size = 0.54, #font size of scale text
legend.position = "topright", #position of legend
legend.cluster.names = NULL, #names of clusters in legend (if NULL, default is used, otherwise use vector with length of number of clusters)
legend.font.size = 1, #font size of legend text
legend.box = T, #create white box around legend
legend.symbol.size = 1.5) { #size of legend symbols
# Ensure ancestry_matrix is valid
if (is.null(ancestry_matrix) || !is.matrix(ancestry_matrix)) {
stop("ancestry_matrixis not valid")
}
# Check if Coordinates is data frame or matrix
if (!is.data.frame(Coordinates) && !is.matrix(Coordinates)) {
stop("Coordinates must be a data frame or matrix")
}
# Convert matrix to data frame if necessary
if (is.matrix(Coordinates)) {
Coordinates <- as.data.frame(Coordinates)
}
# Check if Coordinates contain Latitude and Longitude
if (!all(c("Latitude", "Longitude") %in% names(Coordinates))) {
stop("Coordinates must contain 'Latitude' and 'Longitude' columns")
}
# Check for NA values in Latitude and Longitude
if (any(is.na(Coordinates$Latitude)) || any(is.na(Coordinates$Longitude))) {
stop("Coordinates must not contain NA values in 'Latitude' or 'Longitude' columns")
}
# Prepare ancestry matrix
q_matrix = as.data.frame(ancestry_matrix) #convert ancestry_matrix to dataframe
q_matrix$Ind = rownames(ancestry_matrix) #add sample names
# Check if number of rows in ancestry matrix matches number of coordinates
if (nrow(q_matrix) != nrow(Coordinates)) {
stop("Number of samples in ancestry_matrix does not match number of samples in Coordinates")
}
# Define color palette for pie charts
k.cols = pie.col.pal
# Define map boundaries
lon_min = min(Coordinates$Longitude) - lon.buffer.range
lon_max = max(Coordinates$Longitude) + lon.buffer.range
lat_min = min(Coordinates$Latitude) - lat.buffer.range
lat_max = max(Coordinates$Latitude) + lat.buffer.range
# Create plot
if (dev.cur() > 1) {
dev.off() #close current device if open to avoid unwanted graphic distortions and other effects
}
# Set plot saving
if (save) {
# Set file name for saving
if (is.null(file.name)) {
file.name <- paste0("Assignment_probability_map_plot.", type)
}
# Set overwriting
if (file.exists(file.name) && !overwrite) {
stop(paste(file.name, "already exists - set overwrite = T to overwrite"))
}
# Set file type
if (type == "svg") {
svg(file.name,
width = width / 2.54,
height = height / 2.54)
} else if (type == "png") {
png(file.name,
width = width,
height = height,
res = resolution,
units = "cm")
} else if (type == "jpg") {
jpeg(file.name,
width = width,
height = height,
res = resolution,
units = "cm")
} else {
stop("Unsupported plot file type - choose from 'svg', 'png', or 'jpg'")
}
}
# Set layout and margins
par(mfrow = c(1, 1),
mar = c(1, 1, 1, 1))
# Create map
maps::map("world",
fill = T,
col = "lightgrey",
xlim = c(lon_min, lon_max),
ylim = c(lat_min, lat_max))
maps::map.axes()
# Add US counties if requested
if (USA.add.counties) {
maps::map("county",
add = T,
col = "grey",
lwd = USA.county.lwd)
}
# Add US states if requested
if (USA.add.states) {
maps::map("state",
add = T,
col = "black",
lwd = USA.state.lwd)
}
# Add pie charts to map
for (i in 1:nrow(q_matrix)) {
coords = matrix(c(Coordinates$Longitude[i], Coordinates$Latitude[i]), ncol = 2, byrow = T)
make.admix.pie.plot(admix.proportions = as.matrix(q_matrix[i, -ncol(q_matrix), drop = F]),
coords = coords,
layer.colors = k.cols,
radii = pie.size,
add = T)
}
# Define legend labels
if (is.null(legend.cluster.names)) {
legend.labels <- paste("Cluster", 1:length(k.cols)) #set default labels
} else {
if (length(legend.cluster.names) != length(k.cols)) {
print("Length of legend.cluster.names must match number of clusters - default labels are used")
legend.labels <- paste("Cluster", 1:length(k.cols)) #default labels
}
legend.labels <- legend.cluster.names #use custom labels
}
# Set legend box
if (legend.box) {
legend_box <- "o"
} else {
legend_box <- "n"
}
# Add legend
legend(legend.position,
legend = legend.labels,
pch = 21,
cex = legend.font.size,
pt.cex = legend.symbol.size,
pt.bg = k.cols,
bty = legend_box)
# Add scale
scale_position_x = scale.position[1] * (lon_max - lon_min) + lon_min
scale_position_y = scale.position[2] * (lat_max - lat_min) + lat_min
maps::map.scale(x = scale_position_x,
y = scale_position_y,
cex = scale.font.size,
relwidth = scale.size,
ratio = F)
# Add north arrow
north_arrow_x = north.arrow.position[1] * (lon_max - lon_min) + lon_min
north_arrow_y = north.arrow.position[2] * (lat_max - lat_min) + lat_min
arrows(x0 = north_arrow_x,
y0 = north_arrow_y,
x1 = north_arrow_x,
y1 = north_arrow_y + north.arrow.length,
length = 0.13,
col = "black",
lwd = north.arrow.lwd)
# Add North "N" above north arrow
text(x = north_arrow_x,
y = north_arrow_y + north.arrow.length + north.arrow.N.position, #adjust position for "N"
labels = "N",
cex = north.arrow.N.size,
col = "black")
# Close graphics device
if (save) {
dev.off()
message(paste("Plot", ifelse(overwrite, "overwritten to", "saved to"), file.name))
}
}
## Plot map with DAPC assignment probabilities
common_ids <- intersect(rownames(dapc_result_genpop_data$posterior), dataset$ID) #extract shared IDs
dapc_coordinates <- dataset[match(common_ids, dataset$ID), c("Longitude", "Latitude")] #subset and reorder coordinates
dapc_anestry_matrix <- dapc_result_genpop_data$posterior[common_ids, , drop = FALSE]
plot.Map.assig.prob(ancestry_matrix = dapc_anestry_matrix,
Coordinates = dapc_coordinates,
legend.cluster.names = colnames(dapc_anestry_matrix),
lon.buffer.range = 5,
lat.buffer.range = 5,
legend.position = "bottomright",
save = T,
type = "svg",
file.name = "DAPC_map.svg",
pie.size = 2.3,
pie.col.pal = cols_pop_DAPC,
overwrite = T,
width = 30,
height = 20,
north.arrow.length = 1.8,
legend.font.size = 1, #font size of legend text
north.arrow.N.position = 0.7,
legend.symbol.size = 1.5,
north.arrow.position = c(0.99, 0.35))
## Plot overwritten to DAPC_map.svg
Leave out “n.clust” option in find.clusters to perform kmeans clustering and search for best k using BIC (lowest BIC is best K; if several K with similar low BIC, choose first one representing bend in BIC curve). Here I fixed k = 5 using “n.clust” based on a prior search (not shown).
clusters_assignment_genpop_data_DAPC <- as.data.frame(clusters_kmeans_genpop_data_DAPC$grp) #assignment of each individual to the chosen number of clusters
colnames(clusters_assignment_genpop_data_DAPC)[1] <- "Cluster_ID" #rename first column
clusters_kmeans_genpop_data_DAPC$size #number of individuals assigned to each Cluster_assignment
## [1] 64 18 11 16 9
genpop_data_DAPC@tab <- matrix(
as.integer(round(apply(genpop_data_DAPC@tab, 2, function(x) {
x[is.na(x)] <- mean(x, na.rm = T) #replace NA with column means
return(x)}))), #return modified column
nrow = nrow(genpop_data_DAPC@tab), #number of rows
ncol = ncol(genpop_data_DAPC@tab), #number of columns
dimnames = dimnames(genpop_data_DAPC@tab)) #preserve original dimensions
xval_results_genpop_data_DAPC_kmeans_run1 <- adegenet::xvalDapc( #run1
genpop_data_DAPC@tab, #genetic data matrix (allele frequencies)
clusters_assignment_genpop_data_DAPC$Cluster_ID, #population assignment (grouping factor) of individuals using Cluster_assignment IDs
training.set = 0.9, #use 90% of data for training during cross-validation
n.pca.max = ceiling(min(nrow(genpop_data_DAPC@tab), ncol(genpop_data_DAPC@tab)) * 0.9), #set maximum number of PCs to evaluate in cross-validation to 90% of total numbers of PCs
result = "groupMean", #use group means to calculate cross-validation result
center = T, #center data before performing PCA
scale = F, #do not scale data (i.e., do not standardize allele frequencies)
n.rep = crossval_N, #set number of cross-validation replicates
xval.plot = T) #plot cross-validation results (mean squared error (MSE) vs. number of PCs)
optimal_pcs_genpop_data_DAPC_kmeans_run1 <- xval_results_genpop_data_DAPC_kmeans_run1[["Number of PCs Achieving Lowest MSE"]] #extract optimal PCs
print(optimal_pcs_genpop_data_DAPC_kmeans_run1) #print optimal PCs
## [1] "60"
xval_results_genpop_data_DAPC_kmeans_run2 <- adegenet::xvalDapc( #run2
genpop_data_DAPC@tab, clusters_assignment_genpop_data_DAPC$Cluster_ID, training.set = 0.9,
result = "groupMean", center = T, scale = F, n.rep = crossval_N, xval.plot = T,
n.pca = seq(as.numeric(optimal_pcs_genpop_data_DAPC_kmeans_run1) - 15,
as.numeric(optimal_pcs_genpop_data_DAPC_kmeans_run1) + 15, by = 1)) #refine PCs range based on run 1
optimal_pcs_genpop_data_DAPC_kmeans_run2 <- xval_results_genpop_data_DAPC_kmeans_run2[["Number of PCs Achieving Lowest MSE"]] #extract refined optimal PCs
print(xval_results_genpop_data_DAPC_kmeans_run2[2:6]) #print cross-validation results
## $`Median and Confidence Interval for Random Chance`
## 2.5% 50% 97.5%
## 0.1325316 0.1995581 0.2756203
##
## $`Mean Successful Assignment by Number of PCs of PCA`
## 45 46 47 48 49 50 51 52
## 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000
## 53 54 55 56 57 58 59 60
## 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 0.9971429 1.0000000
## 61 62 63 64 65 66 67 68
## 0.9985714 1.0000000 1.0000000 1.0000000 0.9885714 0.9971429 0.9950000 0.9935714
## 69 70 71 72 73 74 75
## 0.9835714 0.9935714 0.9921429 0.9871429 0.9564286 0.9621429 0.9764286
##
## $`Number of PCs Achieving Highest Mean Success`
## [1] "45"
##
## $`Root Mean Squared Error by Number of PCs of PCA`
## 45 46 47 48 49 50
## 0.000000000 0.000000000 0.000000000 0.000000000 0.000000000 0.000000000
## 51 52 53 54 55 56
## 0.000000000 0.000000000 0.000000000 0.000000000 0.000000000 0.000000000
## 57 58 59 60 61 62
## 0.000000000 0.000000000 0.012777531 0.000000000 0.006388766 0.000000000
## 63 64 65 66 67 68
## 0.000000000 0.000000000 0.032261685 0.009035079 0.022360680 0.023255458
## 69 70 71 72 73 74
## 0.039253233 0.028749445 0.029450754 0.032888184 0.075390142 0.056874369
## 75
## 0.051804184
##
## $`Number of PCs Achieving Lowest MSE`
## [1] "64"
dapc_result_genpop_data_kmeans <- dapc(genpop_data_DAPC, pop = clusters_assignment_genpop_data_DAPC$Cluster_ID,
n.pca = as.numeric(optimal_pcs_genpop_data_DAPC_kmeans_run2), #optimal number of PCs from cross-validation
n.da = length(unique(clusters_assignment_genpop_data_DAPC$Cluster_ID)) - 1) #number of discriminant functions (clusters - 1)
save(xval_results_genpop_data_DAPC_kmeans_run1, optimal_pcs_genpop_data_DAPC_kmeans_run1,
xval_results_genpop_data_DAPC_kmeans_run2, optimal_pcs_genpop_data_DAPC_kmeans_run2,
dapc_result_genpop_data_kmeans,
file = "DAPC_results_genpop_data_kmeans.Rdata")
load("DAPC_results_genpop_data_kmeans.Rdata")
print(round(dapc_result_genpop_data_kmeans$var, 2) * 100) #conserved variance in percent
## [1] 80
par(mar = c(5, 5, 3, 3))
barplot(dapc_result_genpop_data_kmeans$eig, names.arg = paste0("LD", seq_along(dapc_result_genpop_data_kmeans$eig)), #plot eigenvalues of discriminant functions
xlab = "Linear discriminants", ylab = "Eigenvalues")
round(dapc_result_genpop_data_kmeans$eig, 1) #show eigenvalues of discriminant functions
## [1] 6234.2 5672.2 3588.7 2535.2
dapc_kmeans_relative_LDs <- round(dapc_result_genpop_data_kmeans$eig / sum(dapc_result_genpop_data_kmeans$eig) * 100, 1)
barplot(dapc_kmeans_relative_LDs, #plot relative proportions of eigenvalues of discriminant functions
names.arg = paste0("LD", seq_along(dapc_result_genpop_data_kmeans$eig)),
xlab = "Linear discriminants", ylab = "Relative proportion of eigenvalues [%]")
cols_pop_DAPC_kmeans <- viridis::viridis(n = length(unique(clusters_assignment_genpop_data_DAPC$Cluster_ID)), begin = 0, end = 1) #define color palette
shapes_pop_DAPC_kmeans <- rep(c(21, 24, 22, 25), length.out = length(unique(clusters_assignment_genpop_data_DAPC$Cluster_ID))) #define shape palette
dapc_kmeans_df_genpop_data <- data.frame(dapc_result_genpop_data_kmeans$ind.coord, group = dapc_result_genpop_data_kmeans$grp) #create dataframe for visualization
LD1_label <- paste("LD1 (", dapc_kmeans_relative_LDs[1], "%)", sep = "") #create axis labels for LD1 showing relative proportions of eigenvalues of discriminant functions in percent
LD2_label <- paste("LD2 (", dapc_kmeans_relative_LDs[2], "%)", sep = "") #create axis labels for LD2 showing relative proportions of eigenvalues of discriminant functions in percent
LD3_label <- paste("LD3 (", dapc_kmeans_relative_LDs[3], "%)", sep = "") #create axis labels for LD3 showing relative proportions of eigenvalues of discriminant functions in percent
svg("DAPC_kmeans_LD1_LD2_genpop_data.svg", width = plot_width, height = plot_height)
ggplot(dapc_kmeans_df_genpop_data, aes(x = LD1, y = LD2)) + #kmeans DAPC scatterplot of LD1 vs LD2
geom_point(size = point_size, alpha = point_alpha, aes(shape = group, fill = group), color = point_outline_color) +
scale_shape_manual(values = shapes_pop_DAPC_kmeans) +
scale_fill_manual(values = cols_pop_DAPC_kmeans) +
labs(shape = population_assignment_name, fill = population_assignment_name, x = LD1_label, y = LD2_label) +
theme_classic() +
guides(override.aes = list(alpha = 1)) +
coord_equal()
dev.off()
svg("DAPC_kmeans_LD1_LD3_genpop_data.svg", width = plot_width, height = plot_height)
ggplot(dapc_kmeans_df_genpop_data, aes(x = LD1, y = LD3)) + #kmeans DAPC scatterplot of LD1 vs LD3
geom_point(size = point_size, alpha = point_alpha, aes(shape = group, fill = group), color = point_outline_color) +
scale_shape_manual(values = shapes_pop_DAPC_kmeans) +
scale_fill_manual(values = cols_pop_DAPC_kmeans) +
labs(shape = population_assignment_name, fill = population_assignment_name, x = LD1_label, y = LD3_label) +
theme_classic() +
guides(override.aes = list(alpha = 1))
dev.off()
plotly::plot_ly( #3D plot of DAPC_kmeans using LD1-LD3
data = dapc_kmeans_df_genpop_data,
x = ~LD1, y = ~LD2, z = ~LD3,
color = ~group, colors = cols_pop_DAPC_kmeans,
type = 'scatter3d', mode = 'markers',
alpha = point_alpha_3D_plot, #marker transparency
marker = list(size = point_size_3D_plot)) #marker size
dapc_kmeans_group_memberships <- as.data.frame(dapc_result_genpop_data_kmeans$posterior) #convert posterior probabilities to data frame
dapc_kmeans_group_memberships$ID <- row.names(dapc_kmeans_group_memberships) #add ID column
dapc_kmeans_group_memberships$Population <- genpop_data$pop[match(dapc_kmeans_group_memberships$ID, indNames(genpop_data))] #match populations to IDs
dapc_kmeans_group_memberships_long <- melt(dapc_kmeans_group_memberships, id.vars = c("ID", "Population"), variable.name = "Cluster", value.name = "Probability") #reshape to long format
svg("DAPC_kmeans_Membership_probabilities.svg", width = 30/2.56, height = 30/2.56)
ggplot(dapc_kmeans_group_memberships_long, aes(x = Probability, y = ID, fill = Cluster)) + #create Structure-like plots
geom_bar(stat = "identity") +
theme_classic() +
scale_fill_manual(values = cols_pop_DAPC_kmeans) + #use viridis color palette
facet_wrap(~ Population, scales = "free_y") + #split y-axis by population
labs(fill = "Cluster") +
theme(axis.text.y = element_text(angle = 0, hjust = 1, size = 3.3), axis.text.x = element_blank(),
axis.title.y = element_blank(), axis.title.x = element_blank(), panel.grid.major = element_blank(),
panel.grid.minor = element_blank(), panel.background = element_blank(), plot.background = element_blank())
dev.off()
##### Assess allele differentiation
dapc_kmeans_loading_LD1 <- adegenet::loadingplot(dapc_result_genpop_data_kmeans$var.contr[, 1], lab.jitter = 100, #plot loading for first LD axis
thres = quantile(dapc_result_genpop_data_kmeans$var.contr[, 1], 0.99)) #set threshold to 99th percentile (top 1% of loadings)
dapc_kmeans_loading_LD2 <- adegenet::loadingplot(dapc_result_genpop_data_kmeans$var.contr[, 2], lab.jitter = 100, #plot loading for second LD axis
thres = quantile(dapc_result_genpop_data_kmeans$var.contr[, 2], 0.99)) #set threshold to 99th percentile (top 1% of loadings)
dapc_kmeans_loading_LD1$var.names #print most differentiated alleles for LD1
## [1] "95445:20:+.0" "95445:20:+.1" "98412:7:-.0" "98412:7:-.1"
## [5] "186168:57:-.0" "186168:57:-.1" "361864:6:+.0" "361864:6:+.1"
## [9] "463883:8:-.0" "463883:8:-.1" "527307:6:+.0" "527307:6:+.1"
## [13] "587353:5:+.0" "587353:5:+.1" "651105:43:-.0" "651105:43:-.1"
## [17] "771979:45:+.0" "771979:45:+.1" "776727:19:-.0" "776727:19:-.1"
## [21] "834920:17:-.0" "834920:17:-.1" "1057500:18:-.0" "1057500:18:-.1"
## [25] "1190182:13:-.0" "1190182:13:-.1" "1197982:37:-.0" "1197982:37:-.1"
## [29] "1256354:9:-.0" "1256354:9:-.1" "1326541:35:-.1" "1326541:35:-.0"
## [33] "1398359:49:-.0" "1398359:49:-.1" "1398440:37:+.0" "1398440:37:+.1"
## [37] "1448846:21:-.0" "1448846:21:-.1" "1478384:6:+.0" "1478384:6:+.1"
## [41] "1577083:16:+.0" "1577083:16:+.1" "1644575:15:+.0" "1644575:15:+.1"
## [45] "1648234:11:-.1" "1648234:11:-.0" "1684898:43:+.0" "1684898:43:+.1"
## [49] "1686749:18:-.0" "1686749:18:-.1" "1825465:38:-.0" "1825465:38:-.1"
## [53] "1848422:9:-.0" "1848422:9:-.1" "2013619:6:-.0" "2013619:6:-.1"
## [57] "2026394:30:-.0" "2026394:30:-.1" "2028920:8:+.0" "2028920:8:+.1"
## [61] "2041339:24:+.0" "2041339:24:+.1" "2044034:20:+.1" "2044034:20:+.0"
## [65] "2219166:4:-.0" "2219166:4:-.1" "2336237:41:+.0" "2336237:41:+.1"
## [69] "2418655:18:+.0" "2418655:18:+.1" "2425420:38:-.0" "2425420:38:-.1"
## [73] "2527226:5:-.0" "2527226:5:-.1" "2716161:8:-.0" "2716161:8:-.1"
## [77] "2896503:13:+.0" "2896503:13:+.1" "2914166:4:-.0" "2914166:4:-.1"
## [81] "2984726:5:-.0" "2984726:5:-.1" "3014454:28:-.0" "3014454:28:-.1"
## [85] "3061381:9:-.0" "3061381:9:-.1" "3103167:9:+.1" "3103167:9:+.0"
## [89] "3232852:11:-.0" "3232852:11:-.1" "3408274:20:+.0" "3408274:20:+.1"
dapc_kmeans_loading_LD2$var.names #print most differentiated alleles for LD2
## [1] "209826:27:+.0" "209826:27:+.1" "292767:9:-.0" "292767:9:-.1"
## [5] "396591:27:-.0" "396591:27:-.1" "686382:4:-.0" "686382:4:-.1"
## [9] "691278:5:-.0" "691278:5:-.1" "703009:18:-.0" "703009:18:-.1"
## [13] "750141:18:-.0" "750141:18:-.1" "759739:26:+.1" "759739:26:+.0"
## [17] "761132:51:-.0" "761132:51:-.1" "776727:19:-.0" "776727:19:-.1"
## [21] "917171:7:+.0" "917171:7:+.1" "924947:8:-.0" "924947:8:-.1"
## [25] "927164:6:+.0" "927164:6:+.1" "959252:10:+.0" "959252:10:+.1"
## [29] "1050155:10:-.0" "1050155:10:-.1" "1073232:13:+.0" "1073232:13:+.1"
## [33] "1117117:18:+.0" "1117117:18:+.1" "1135134:12:+.0" "1135134:12:+.1"
## [37] "1261020:19:-.0" "1261020:19:-.1" "1335232:9:-.0" "1335232:9:-.1"
## [41] "1549631:7:+.0" "1549631:7:+.1" "1666926:22:+.0" "1666926:22:+.1"
## [45] "1697966:7:-.0" "1697966:7:-.1" "1739335:15:+.0" "1739335:15:+.1"
## [49] "1789990:10:+.0" "1789990:10:+.1" "1791591:9:-.0" "1791591:9:-.1"
## [53] "2004857:55:-.1" "2004857:55:-.0" "2011745:23:-.0" "2011745:23:-.1"
## [57] "2127415:13:+.0" "2127415:13:+.1" "2147586:28:-.0" "2147586:28:-.1"
## [61] "2202785:6:-.0" "2202785:6:-.1" "2342825:6:+.0" "2342825:6:+.1"
## [65] "2559347:4:-.0" "2559347:4:-.1" "2702045:14:+.0" "2702045:14:+.1"
## [69] "2716161:8:-.0" "2716161:8:-.1" "2816714:11:-.0" "2816714:11:-.1"
## [73] "2909766:57:-.0" "2909766:57:-.1" "2939340:6:+.0" "2939340:6:+.1"
## [77] "3048695:18:-.0" "3048695:18:-.1" "3074851:7:+.0" "3074851:7:+.1"
## [81] "3103167:9:+.1" "3103167:9:+.0" "3134272:9:+.0" "3134272:9:+.1"
## [85] "3177228:11:+.0" "3177228:11:+.1" "3278150:15:+.0" "3278150:15:+.1"
## [89] "3322807:5:-.0" "3322807:5:-.1" "3329625:9:+.0" "3329625:9:+.1"
intersect(dapc_kmeans_loading_LD1$var.names, dapc_kmeans_loading_LD2$var.names) #show differentiated alleles for both LD1 and LD2
## [1] "776727:19:-.0" "776727:19:-.1" "2716161:8:-.0" "2716161:8:-.1"
## [5] "3103167:9:+.1" "3103167:9:+.0"
dapc_kmeans_group_memberships <- as.data.frame(dapc_result_genpop_data_kmeans$posterior) #convert posterior probabilities to data frame
subset_size <- 40 #define subset size
plot_assignments <- function(dapc_kmeans_result, subset_size, max_rows) { #plot group memberships (heat colors represent membership probabilities with red=1 and white=0, blue crosses represent prior Cluster_assignment provided to DAPC kmeans)
num_plots <- ceiling(max_rows / subset_size) #calculate number of plots needed
for (i in 0:(num_plots - 1)) {
start_row <- i * subset_size + 1 #start index for subset
end_row <- min((i + 1) * subset_size, max_rows) #end index for subset
cat("Plotting rows", start_row, "to", end_row, "\n") #print status
adegenet::assignplot(dapc_kmeans_result, subset = start_row:end_row)}} #plot assignments for current subset
par(mar = c(5, 11, 3, 3))
plot_assignments(dapc_result_genpop_data_kmeans, subset_size, nrow(dapc_kmeans_group_memberships)) #plot all subsets
## Plotting rows 1 to 40
## Plotting rows 41 to 80
## Plotting rows 81 to 118
dapc_result_genpop_data_kmeans$grp <- factor(dapc_result_genpop_data_kmeans$grp, levels = unique(dapc_result_genpop_data_kmeans$grp))
admixed_individuals <- which(apply(dapc_result_genpop_data_kmeans$posterior, 1, function(e) all(e < 0.80))) #identify admixed individuals (posterior probabilities less than 0.80)
admixed_individuals #show admixed individuals
## named integer(0)
if (length(admixed_individuals) > 0) {adegenet::compoplot(dapc_result_genpop_data_kmeans, col.pal = cols_pop_DAPC_kmeans, subset = admixed_individuals)} #plot admixed individuals
## Plot map with DAPC assignment probabilities
common_ids <- intersect(rownames(dapc_result_genpop_data_kmeans$posterior), dataset$ID) #extract shared IDs
head(common_ids)
dapc_coordinates_kmeans <- dataset[match(common_ids, dataset$ID), c("Longitude", "Latitude")] #subset and reorder coordinates
head(dapc_coordinates_kmeans)
dapc_anestry_matrix_kmeans <- dapc_result_genpop_data_kmeans$posterior[common_ids, , drop = FALSE]
head(dapc_anestry_matrix_kmeans)
cols_pop_DAPC_kmeans
plot.Map.assig.prob(ancestry_matrix = dapc_anestry_matrix_kmeans,
Coordinates = dapc_coordinates_kmeans,
legend.cluster.names = colnames(dapc_anestry_matrix_kmeans),
lon.buffer.range = 2.5,
lat.buffer.range = 2.5,
legend.position = "bottomright",
save = T,
type = "svg",
file.name = "DAPC_map_kmeans.svg",
pie.size = 2.5,
pie.col.pal = cols_pop_DAPC_kmeans,
overwrite = T,
width = 30,
height = 20,
north.arrow.length = 1.8,
legend.font.size = 1, #font size of legend text
north.arrow.N.position = 0.7,
legend.symbol.size = 1.5,
north.arrow.position = c(0.992, 0.35))
## Plot overwritten to DAPC_map_kmeans.svg